-rw-r--r-- clang/include/clang/Basic/BuiltinsHexagon.td | 6
-rw-r--r-- clang/include/clang/Basic/FileManager.h | 8
-rw-r--r-- clang/include/clang/CIR/MissingFeatures.h | 2
-rw-r--r-- clang/include/clang/Driver/Options.td | 2
-rw-r--r-- clang/lib/Basic/FileManager.cpp | 7
-rw-r--r-- clang/lib/Basic/Targets/Hexagon.cpp | 6
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenCXXABI.h | 5
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 3
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 6
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 29
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 18
-rw-r--r-- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4
-rw-r--r-- clang/lib/Frontend/CompilerInstance.cpp | 2
-rw-r--r-- clang/lib/Parse/ParseHLSL.cpp | 6
-rw-r--r-- clang/lib/Sema/SemaObjCProperty.cpp | 2
-rw-r--r-- clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp | 27
-rw-r--r-- clang/test/CIR/CodeGen/delete.cpp | 39
-rw-r--r-- clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp | 44
-rw-r--r-- clang/test/Driver/fuchsia.c | 10
-rw-r--r-- clang/test/Driver/hexagon-toolchain-elf.c | 7
-rw-r--r-- clang/test/Preprocessor/hexagon-predefines.c | 17
-rw-r--r-- clang/tools/driver/cc1_main.cpp | 11
-rw-r--r-- flang-rt/lib/cuda/allocator.cpp | 21
-rw-r--r-- libc/include/CMakeLists.txt | 13
-rw-r--r-- libc/include/llvm-libc-types/CMakeLists.txt | 13
-rw-r--r-- libc/include/llvm-libc-types/int_hk_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/int_hr_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/int_k_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/int_lk_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/int_lr_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/int_r_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/stdfix-types.h | 25
-rw-r--r-- libc/include/llvm-libc-types/uint_uhk_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/uint_uhr_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/uint_uk_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/uint_ulk_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/uint_ulr_t.h | 14
-rw-r--r-- libc/include/llvm-libc-types/uint_ur_t.h | 14
-rw-r--r-- libc/include/stdfix.yaml | 14
-rw-r--r-- libc/src/stdfix/CMakeLists.txt | 1
-rw-r--r-- libc/src/stdfix/bitshk.cpp | 2
-rw-r--r-- libc/src/stdfix/bitshk.h | 2
-rw-r--r-- libc/src/stdfix/bitshr.cpp | 2
-rw-r--r-- libc/src/stdfix/bitshr.h | 2
-rw-r--r-- libc/src/stdfix/bitsk.cpp | 2
-rw-r--r-- libc/src/stdfix/bitsk.h | 2
-rw-r--r-- libc/src/stdfix/bitslk.cpp | 2
-rw-r--r-- libc/src/stdfix/bitslk.h | 2
-rw-r--r-- libc/src/stdfix/bitslr.cpp | 2
-rw-r--r-- libc/src/stdfix/bitslr.h | 2
-rw-r--r-- libc/src/stdfix/bitsr.cpp | 2
-rw-r--r-- libc/src/stdfix/bitsr.h | 2
-rwxr-xr-x libc/src/stdfix/bitsuhk.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsuhk.h | 2
-rwxr-xr-x libc/src/stdfix/bitsuhr.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsuhr.h | 2
-rwxr-xr-x libc/src/stdfix/bitsuk.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsuk.h | 2
-rwxr-xr-x libc/src/stdfix/bitsulk.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsulk.h | 2
-rwxr-xr-x libc/src/stdfix/bitsulr.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsulr.h | 2
-rwxr-xr-x libc/src/stdfix/bitsur.cpp | 2
-rwxr-xr-x libc/src/stdfix/bitsur.h | 2
-rwxr-xr-x libc/src/stdfix/bitusk.cpp | 2
-rw-r--r-- libc/src/stdfix/hkbits.h | 2
-rw-r--r-- libc/src/stdfix/hrbits.h | 2
-rw-r--r-- libc/src/stdfix/kbits.h | 2
-rw-r--r-- libc/src/stdfix/lkbits.h | 2
-rw-r--r-- libc/src/stdfix/lrbits.h | 2
-rw-r--r-- libc/src/stdfix/rbits.h | 2
-rw-r--r-- libc/src/stdfix/uhkbits.h | 2
-rw-r--r-- libc/src/stdfix/uhrbits.h | 2
-rw-r--r-- libc/src/stdfix/ukbits.h | 2
-rw-r--r-- libc/src/stdfix/ulkbits.h | 2
-rw-r--r-- libc/src/stdfix/ulrbits.h | 2
-rw-r--r-- libc/src/stdfix/urbits.h | 2
-rw-r--r-- libc/test/src/stdfix/CMakeLists.txt | 13
-rw-r--r-- libc/test/src/stdfix/FxBitsTest.h | 1
-rw-r--r-- libc/test/src/stdfix/bitshk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitshr_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitslk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitslr_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsr_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsuhk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsuhr_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsuk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsulk_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsulr_test.cpp | 2
-rw-r--r-- libc/test/src/stdfix/bitsur_test.cpp | 2
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/TargetLowering.h | 8
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsHexagon.td | 12
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsHexagonDep.td | 14
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsRISCVXsf.td | 4
-rw-r--r-- llvm/include/llvm/Support/Timer.h | 9
-rw-r--r-- llvm/lib/Analysis/DependenceAnalysis.cpp | 51
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 74
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 4
-rw-r--r-- llvm/lib/MC/CMakeLists.txt | 3
-rw-r--r-- llvm/lib/MC/MCSFrame.cpp | 155
-rw-r--r-- llvm/lib/Support/Timer.cpp | 20
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 44
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 74
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td | 6
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp | 21
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h | 19
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 53
-rw-r--r-- llvm/lib/Target/Hexagon/Hexagon.td | 13
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepArch.h | 4
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepArch.td | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepIICHVX.td | 592
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepIICScalar.td | 888
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td | 37
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td | 11
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 4
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 4
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 162
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonPatternsV65.td | 13
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonSchedule.td | 1
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonScheduleV81.td | 31
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonSubtarget.h | 9
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 59
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h | 6
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 884
-rw-r--r-- llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 3
-rw-r--r-- llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 45
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFeatures.td | 3
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 35
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 8
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 81
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 18
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSchedule.td | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVScheduleXSf.td | 20
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.h | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 12
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 26
-rw-r--r-- llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll | 48
-rw-r--r-- llvm/test/Assembler/metadata-annotations.ll | 9
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 489
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 14
-rw-r--r-- llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll | 178
-rw-r--r-- llvm/test/CodeGen/AArch64/dup.ll | 12
-rw-r--r-- llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 45
-rw-r--r-- llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir | 97
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 21
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 69
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 738
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir | 275
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir | 275
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 24
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 56
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll | 144
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 32
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 921
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fminimum.ll | 921
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 222
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/binary-format.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll | 11
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-builtin.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-global-var.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-func.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/filename.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-non-void.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-source.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-typedef.ll | 8
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/func-void.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll | 28
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll | 24
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/local-var.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/pruning-const.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-func.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-inited.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-sec.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/static-var.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll | 8
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/weak-global-2.ll | 5
-rw-r--r-- llvm/test/CodeGen/BPF/BTF/weak-global.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll | 9
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll | 14
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 22
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll | 12
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll | 13
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll | 24
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll | 18
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll | 16
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll | 20
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll | 26
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/store-addr.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll | 19
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll | 17
-rw-r--r-- llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll | 17
-rw-r--r-- llvm/test/CodeGen/BPF/callx.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/dwarfdump.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/i128.ll | 22
-rw-r--r-- llvm/test/CodeGen/BPF/is_trunc_free.ll | 10
-rw-r--r-- llvm/test/CodeGen/BPF/is_zext_free.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/objdump_two_funcs.ll | 9
-rw-r--r-- llvm/test/CodeGen/BPF/optnone-1.ll | 7
-rw-r--r-- llvm/test/CodeGen/BPF/reloc-btf-2.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/reloc-btf.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/simplifycfg.ll | 27
-rw-r--r-- llvm/test/CodeGen/BPF/warn-stack.ll | 30
-rw-r--r-- llvm/test/CodeGen/BPF/xadd.ll | 7
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll | 63
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll | 55
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll | 54
-rw-r--r-- llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll | 52
-rw-r--r-- llvm/test/CodeGen/Hexagon/masked_gather.ll | 58
-rw-r--r-- llvm/test/CodeGen/Hexagon/vector-gather.ll | 27
-rw-r--r-- llvm/test/CodeGen/RISCV/rv32p.ll | 709
-rw-r--r-- llvm/test/CodeGen/RISCV/rv64p.ll | 677
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll | 191
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll | 191
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll | 160
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll | 335
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll | 125
-rw-r--r-- llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s | 36
-rw-r--r-- llvm/test/MC/ELF/cfi-sframe-cfi-escape.s | 46
-rw-r--r-- llvm/test/MC/Hexagon/arch-support.s | 3
-rw-r--r-- llvm/test/MC/Hexagon/v81_arch.s | 10
-rw-r--r-- llvm/test/MC/PowerPC/ppc64-encoding-ext.s | 14
-rw-r--r-- llvm/test/Transforms/InstCombine/ctlz-cttz.ll | 145
-rw-r--r-- llvm/test/Transforms/InstCombine/scmp.ll | 56
-rw-r--r-- llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll | 3
-rw-r--r-- llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s | 34
-rw-r--r-- llvm/test/tools/llvm-profdata/input-wildcard.test | 15
-rw-r--r-- llvm/tools/llvm-profdata/llvm-profdata.cpp | 3
-rw-r--r-- llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt | 1
-rw-r--r-- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 47
-rw-r--r-- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 25
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc | 28
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 63
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h | 3
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td | 10
-rw-r--r-- mlir/include/mlir/IR/CommonAttrConstraints.td | 5
-rw-r--r-- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 70
-rw-r--r-- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 62
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 159
-rw-r--r-- mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp | 32
-rw-r--r-- mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp | 2
-rw-r--r-- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 25
-rw-r--r-- mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 6
-rw-r--r-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 25
-rw-r--r-- mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir (renamed from mlir/test/Conversion/AMDGPUToROCDL/wmma.mlir) | 27
-rw-r--r-- mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir | 46
-rw-r--r-- mlir/test/Dialect/AMDGPU/invalid.mlir | 46
-rw-r--r-- mlir/test/Dialect/AMDGPU/ops.mlir | 15
-rw-r--r-- mlir/test/Dialect/Tosa/availability.mlir | 18
-rw-r--r-- mlir/test/Dialect/Tosa/invalid_extension.mlir | 17
-rw-r--r-- mlir/test/Dialect/Tosa/level_check.mlir | 33
-rw-r--r-- mlir/test/Dialect/Tosa/ops.mlir | 28
-rw-r--r-- mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir | 14
-rw-r--r-- mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 45
-rw-r--r-- mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir | 24
-rw-r--r-- mlir/test/Dialect/Tosa/verifier.mlir | 88
-rw-r--r-- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 70
-rw-r--r-- offload/include/OffloadPolicy.h | 2
-rw-r--r-- offload/include/OpenMP/InternalTypes.h | 21
-rw-r--r-- offload/include/OpenMP/omp.h | 2
-rw-r--r-- offload/libomptarget/OpenMP/API.cpp | 23
-rw-r--r-- offload/libomptarget/OpenMP/InteropAPI.cpp | 8
-rw-r--r-- utils/bazel/llvm-project-overlay/lldb/BUILD.bazel | 14
-rw-r--r-- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1
349 files changed, 11822 insertions, 2812 deletions
diff --git a/clang/include/clang/Basic/BuiltinsHexagon.td b/clang/include/clang/Basic/BuiltinsHexagon.td
index 0727c67..cf18359 100644
--- a/clang/include/clang/Basic/BuiltinsHexagon.td
+++ b/clang/include/clang/Basic/BuiltinsHexagon.td
@@ -20,8 +20,9 @@ class V<string version, VFeatures newer> : VFeatures {
let Features = !strconcat("v", version, "|", newer.Features);
}
-let Features = "v79" in def V79 : VFeatures;
+let Features = "v81" in def V81 : VFeatures;
+def V79 : V<"79", V81>;
def V75 : V<"75", V79>;
def V73 : V<"73", V75>;
def V71 : V<"71", V73>;
@@ -43,8 +44,9 @@ class HVXV<string version, HVXVFeatures newer> : HVXVFeatures {
let Features = !strconcat("hvxv", version, "|", newer.Features);
}
-let Features = "hvxv79" in def HVXV79 : HVXVFeatures;
+let Features = "hvxv81" in def HVXV81 : HVXVFeatures;
+def HVXV79 : HVXV<"79", HVXV81>;
def HVXV75 : HVXV<"75", HVXV79>;
def HVXV73 : HVXV<"73", HVXV75>;
def HVXV71 : HVXV<"71", HVXV73>;
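
Illustrative note on the chaining above: each V<...>/HVXV<...> record concatenates its own version with everything newer, so a builtin tagged V75 is enabled on v75 and later. A minimal C++ sketch of the same !strconcat pattern (helper name is hypothetical, not generated output):

  #include <string>
  // Mirrors: let Features = !strconcat("v", version, "|", newer.Features);
  static std::string chainFeatures(const std::string &Version,
                                   const std::string &NewerFeatures) {
    return "v" + Version + "|" + NewerFeatures;
  }
  // chainFeatures("79", "v81")                      == "v79|v81"
  // chainFeatures("75", chainFeatures("79", "v81")) == "v75|v79|v81"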
diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h
index 337911e..fa7552b 100644
--- a/clang/include/clang/Basic/FileManager.h
+++ b/clang/include/clang/Basic/FileManager.h
@@ -287,8 +287,12 @@ public:
/// If path is not absolute and FileSystemOptions set the working
/// directory, the path is modified to be relative to the given
/// working directory.
- /// \returns true if \c path changed.
- bool FixupRelativePath(SmallVectorImpl<char> &path) const;
+ /// \returns true if \c Path changed.
+ bool FixupRelativePath(SmallVectorImpl<char> &Path) const {
+ return fixupRelativePath(FileSystemOpts, Path);
+ }
+ static bool fixupRelativePath(const FileSystemOptions &FileSystemOpts,
+ SmallVectorImpl<char> &Path);
/// Makes \c Path absolute taking into account FileSystemOptions and the
/// working directory option.
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 598e826a..369ac3d 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -231,6 +231,7 @@ struct MissingFeatures {
static bool coverageMapping() { return false; }
static bool createInvariantGroup() { return false; }
static bool createProfileWeightsForLoop() { return false; }
+ static bool ctorConstLvalueToRvalueConversion() { return false; }
static bool ctorMemcpyizer() { return false; }
static bool cudaSupport() { return false; }
static bool cxxRecordStaticMembers() { return false; }
@@ -240,6 +241,7 @@ struct MissingFeatures {
static bool dataLayoutPtrHandlingBasedOnLangAS() { return false; }
static bool deferredCXXGlobalInit() { return false; }
static bool deleteArray() { return false; }
+ static bool devirtualizeDestructor() { return false; }
static bool devirtualizeMemberFunction() { return false; }
static bool ehCleanupFlags() { return false; }
static bool ehCleanupHasPrebranchedFallthrough() { return false; }
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0c9584f..93aeb22 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6558,6 +6558,8 @@ def mv75 : Flag<["-"], "mv75">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv75"]>;
def mv79 : Flag<["-"], "mv79">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv79"]>;
+def mv81 : Flag<["-"], "mv81">, Group<m_hexagon_Features_Group>,
+ Alias<mcpu_EQ>, AliasArgs<["hexagonv81"]>;
def mhexagon_hvx : Flag<["-"], "mhvx">, Group<m_hexagon_Features_HVX_Group>,
HelpText<"Enable Hexagon Vector eXtensions">;
def mhexagon_hvx_EQ : Joined<["-"], "mhvx=">,
diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 7481e1e..e744cc0 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -474,8 +474,9 @@ OptionalFileEntryRef FileManager::getBypassFile(FileEntryRef VF) {
return FileEntryRef(*Insertion.first);
}
-bool FileManager::FixupRelativePath(SmallVectorImpl<char> &path) const {
- StringRef pathRef(path.data(), path.size());
+bool FileManager::fixupRelativePath(const FileSystemOptions &FileSystemOpts,
+ SmallVectorImpl<char> &Path) {
+ StringRef pathRef(Path.data(), Path.size());
if (FileSystemOpts.WorkingDir.empty()
|| llvm::sys::path::is_absolute(pathRef))
@@ -483,7 +484,7 @@ bool FileManager::FixupRelativePath(SmallVectorImpl<char> &path) const {
SmallString<128> NewPath(FileSystemOpts.WorkingDir);
llvm::sys::path::append(NewPath, pathRef);
- path = NewPath;
+ Path = NewPath;
return true;
}
diff --git a/clang/lib/Basic/Targets/Hexagon.cpp b/clang/lib/Basic/Targets/Hexagon.cpp
index cea64f9..d5b413cb 100644
--- a/clang/lib/Basic/Targets/Hexagon.cpp
+++ b/clang/lib/Basic/Targets/Hexagon.cpp
@@ -83,6 +83,9 @@ void HexagonTargetInfo::getTargetDefines(const LangOptions &Opts,
} else if (CPU == "hexagonv79") {
Builder.defineMacro("__HEXAGON_V79__");
Builder.defineMacro("__HEXAGON_ARCH__", "79");
+ } else if (CPU == "hexagonv81") {
+ Builder.defineMacro("__HEXAGON_V81__");
+ Builder.defineMacro("__HEXAGON_ARCH__", "81");
}
if (hasFeature("hvx-length64b")) {
@@ -252,8 +255,7 @@ static constexpr CPUSuffix Suffixes[] = {
{{"hexagonv68"}, {"68"}}, {{"hexagonv69"}, {"69"}},
{{"hexagonv71"}, {"71"}}, {{"hexagonv71t"}, {"71t"}},
{{"hexagonv73"}, {"73"}}, {{"hexagonv75"}, {"75"}},
- {{"hexagonv79"}, {"79"}},
-};
+ {{"hexagonv79"}, {"79"}}, {{"hexagonv81"}, {"81"}}};
std::optional<unsigned> HexagonTargetInfo::getHexagonCPURev(StringRef Name) {
StringRef Arch = Name;
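
Hypothetical user code gated on the predefines added above; both macros come straight from the hexagonv81 branch of getTargetDefines:

  #if defined(__HEXAGON_V81__) && __HEXAGON_ARCH__ >= 81
  // v81-specific code path
  #else
  // fallback for older Hexagon architectures
  #endif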
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index d3c7dac0..13dc9f3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -187,6 +187,11 @@ public:
virtual void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor,
mlir::Value addr) = 0;
+ virtual void emitVirtualObjectDelete(CIRGenFunction &cgf,
+ const CXXDeleteExpr *de, Address ptr,
+ QualType elementType,
+ const CXXDestructorDecl *dtor) = 0;
+
/// Checks if ABI requires extra virtual offset for vtable field.
virtual bool
isVirtualOffsetNeededForVTableField(CIRGenFunction &cgf,
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index d6d226b..8fe0d9b4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -362,8 +362,7 @@ public:
cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitCXXTypeidExpr");
}
void VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *e) {
- cgf.cgm.errorNYI(e->getSourceRange(),
- "AggExprEmitter: VisitMaterializeTemporaryExpr");
+ Visit(e->getSubExpr());
}
void VisitOpaqueValueExpr(OpaqueValueExpr *e) {
cgf.cgm.errorNYI(e->getSourceRange(),
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index a3cdf19..7a35382 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -565,8 +565,10 @@ static void emitObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de,
dtor = rd->getDestructor();
if (dtor->isVirtual()) {
- cgf.cgm.errorNYI(de->getSourceRange(),
- "emitObjectDelete: virtual destructor");
+ assert(!cir::MissingFeatures::devirtualizeDestructor());
+ cgf.cgm.getCXXABI().emitVirtualObjectDelete(cgf, de, ptr, elementType,
+ dtor);
+ return;
}
}
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 95905ce..928e5aa 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -1078,9 +1078,32 @@ public:
mlir::Attribute VisitCXXConstructExpr(CXXConstructExpr *e, QualType ty) {
if (!e->getConstructor()->isTrivial())
- return nullptr;
- cgm.errorNYI(e->getBeginLoc(), "trivial constructor const handling");
- return {};
+ return {};
+
+ // Only default and copy/move constructors can be trivial.
+ if (e->getNumArgs()) {
+ assert(e->getNumArgs() == 1 && "trivial ctor with > 1 argument");
+ assert(e->getConstructor()->isCopyOrMoveConstructor() &&
+ "trivial ctor has argument but isn't a copy/move ctor");
+
+ Expr *arg = e->getArg(0);
+ assert(cgm.getASTContext().hasSameUnqualifiedType(ty, arg->getType()) &&
+ "argument to copy ctor is of wrong type");
+
+ // Look through the temporary; it's just converting the value to an lvalue
+ // to pass it to the constructor.
+ if (auto const *mte = dyn_cast<MaterializeTemporaryExpr>(arg))
+ return Visit(mte->getSubExpr(), ty);
+
+ // TODO: Investigate whether there are cases that can fall through to here
+ // that need to be handled. This is missing in classic codegen also.
+ assert(!cir::MissingFeatures::ctorConstLvalueToRvalueConversion());
+
+ // Don't try to support arbitrary lvalue-to-rvalue conversions for now.
+ return {};
+ }
+
+ return cgm.getBuilder().getZeroInitAttr(cgm.convertType(ty));
}
mlir::Attribute VisitStringLiteral(StringLiteral *e, QualType t) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index 2dce0b1..88fedf1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -74,6 +74,9 @@ public:
QualType thisTy) override;
void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor,
mlir::Value addr) override;
+ void emitVirtualObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de,
+ Address ptr, QualType elementType,
+ const CXXDestructorDecl *dtor) override;
void emitRethrow(CIRGenFunction &cgf, bool isNoReturn) override;
void emitThrow(CIRGenFunction &cgf, const CXXThrowExpr *e) override;
@@ -2175,6 +2178,21 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf,
isRefCast, castInfo);
}
+/// The Itanium ABI always places an offset to the complete object
+/// at entry -2 in the vtable.
+void CIRGenItaniumCXXABI::emitVirtualObjectDelete(
+ CIRGenFunction &cgf, const CXXDeleteExpr *delExpr, Address ptr,
+ QualType elementType, const CXXDestructorDecl *dtor) {
+ bool useGlobalDelete = delExpr->isGlobalDelete();
+ if (useGlobalDelete) {
+ cgf.cgm.errorNYI(delExpr->getSourceRange(),
+ "emitVirtualObjectDelete: global delete");
+ }
+
+ CXXDtorType dtorType = useGlobalDelete ? Dtor_Complete : Dtor_Deleting;
+ emitVirtualDestructorCall(cgf, dtor, dtorType, ptr, delExpr);
+}
+
/************************** Array allocation cookies **************************/
CharUnits CIRGenItaniumCXXABI::getArrayCookieSizeImpl(QualType elementType) {
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 146dc8b..31c2f3f 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -481,9 +481,11 @@ SanitizerMask Fuchsia::getSupportedSanitizers() const {
Res |= SanitizerKind::Fuzzer;
Res |= SanitizerKind::FuzzerNoLink;
Res |= SanitizerKind::Leak;
- Res |= SanitizerKind::SafeStack;
Res |= SanitizerKind::Scudo;
Res |= SanitizerKind::Thread;
+ if (getTriple().getArch() == llvm::Triple::x86_64) {
+ Res |= SanitizerKind::SafeStack;
+ }
return Res;
}
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index e3bf0ea..6b09f7f 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -882,7 +882,7 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary,
"File Manager is required to fix up relative path.\n");
AbsPath.emplace(OutputPath);
- FileMgr->FixupRelativePath(*AbsPath);
+ FileManager::fixupRelativePath(getFileSystemOpts(), *AbsPath);
OutputPath = *AbsPath;
}
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index 51f2aef..c727ee3 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -126,15 +126,9 @@ Parser::ParsedSemantic Parser::ParseHLSLSemantic() {
// semantic index. The semantic index is the number at the end of
// the semantic, including leading zeroes. Digits located before
// the last letter are part of the semantic name.
- bool Invalid = false;
SmallString<256> Buffer;
Buffer.resize(Tok.getLength() + 1);
StringRef Identifier = PP.getSpelling(Tok, Buffer);
- if (Invalid) {
- Diag(Tok.getLocation(), diag::err_expected_semantic_identifier);
- return {};
- }
-
assert(Identifier.size() > 0);
// Determine the start of the semantic index.
unsigned IndexIndex = Identifier.find_last_not_of("0123456789") + 1;
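
A worked example of the split computed above, with an illustrative semantic name (leading zeroes count toward the index, per the comment):

  #include "llvm/ADT/StringRef.h"
  #include <cassert>

  static void semanticSplitExample() {
    llvm::StringRef Identifier = "TEXCOORD01"; // hypothetical HLSL semantic
    unsigned IndexIndex = Identifier.find_last_not_of("0123456789") + 1;
    assert(IndexIndex == 8);                                // digits start at "01"
    assert(Identifier.substr(0, IndexIndex) == "TEXCOORD"); // semantic name
    // The digit run "01" then parses as semantic index 1.
  }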
diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp
index 1880cec..67c554c 100644
--- a/clang/lib/Sema/SemaObjCProperty.cpp
+++ b/clang/lib/Sema/SemaObjCProperty.cpp
@@ -1041,7 +1041,7 @@ RedeclarePropertyAccessor(ASTContext &Context, ObjCImplementationDecl *Impl,
Decl->getSelector(), Decl->getReturnType(),
Decl->getReturnTypeSourceInfo(), Impl, Decl->isInstanceMethod(),
Decl->isVariadic(), Decl->isPropertyAccessor(),
- /* isSynthesized*/ true, Decl->isImplicit(), Decl->isDefined(),
+ /*isSynthesizedAccessorStub=*/true, Decl->isImplicit(), Decl->isDefined(),
Decl->getImplementationControl(), Decl->hasRelatedResultType());
ImplDecl->getMethodFamily();
if (Decl->hasAttrs())
diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
index 82b560b..e0deec1 100644
--- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
+++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
@@ -128,7 +128,6 @@ public:
std::unique_ptr<llvm::Timer> SyntaxCheckTimer;
std::unique_ptr<llvm::Timer> ExprEngineTimer;
std::unique_ptr<llvm::Timer> BugReporterTimer;
- bool ShouldClearTimersToPreventDisplayingThem;
/// The information about analyzed functions shared throughout the
/// translation unit.
@@ -149,7 +148,10 @@ public:
if (Opts.AnalyzerDisplayProgress || Opts.PrintStats ||
Opts.ShouldSerializeStats || !Opts.DumpEntryPointStatsToCSV.empty()) {
AnalyzerTimers = std::make_unique<llvm::TimerGroup>(
- "analyzer", "Analyzer timers");
+ "analyzer", "Analyzer timers",
+ /*PrintOnExit=*/
+ (Opts.AnalyzerDisplayProgress || Opts.PrintStats ||
+ Opts.ShouldSerializeStats));
SyntaxCheckTimer = std::make_unique<llvm::Timer>(
"syntaxchecks", "Syntax-based analysis time", *AnalyzerTimers);
ExprEngineTimer = std::make_unique<llvm::Timer>(
@@ -159,12 +161,6 @@ public:
*AnalyzerTimers);
}
- // Avoid displaying the timers created above in case we only want to record
- // per-entry-point stats.
- ShouldClearTimersToPreventDisplayingThem = !Opts.AnalyzerDisplayProgress &&
- !Opts.PrintStats &&
- !Opts.ShouldSerializeStats;
-
if (Opts.PrintStats || Opts.ShouldSerializeStats) {
llvm::EnableStatistics(/* DoPrintOnExit= */ false);
}
@@ -287,9 +283,6 @@ public:
checkerMgr->runCheckersOnASTDecl(D, *Mgr, *RecVisitorBR);
if (SyntaxCheckTimer)
SyntaxCheckTimer->stopTimer();
- if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) {
- AnalyzerTimers->clear();
- }
}
return true;
}
@@ -583,9 +576,6 @@ void AnalysisConsumer::runAnalysisOnTranslationUnit(ASTContext &C) {
checkerMgr->runCheckersOnASTDecl(TU, *Mgr, BR);
if (SyntaxCheckTimer)
SyntaxCheckTimer->stopTimer();
- if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) {
- AnalyzerTimers->clear();
- }
// Run the AST-only checks using the order in which functions are defined.
// If inlining is not turned on, use the simplest function order for path
@@ -765,9 +755,6 @@ void AnalysisConsumer::HandleCode(Decl *D, AnalysisMode Mode,
FunctionSummaries.findOrInsertSummary(D)->second.SyntaxRunningTime =
std::lround(CheckerDuration.getWallTime() * 1000);
DisplayTime(CheckerDuration);
- if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) {
- AnalyzerTimers->clear();
- }
}
}
@@ -830,9 +817,6 @@ void AnalysisConsumer::RunPathSensitiveChecks(Decl *D,
PathRunningTime.set(static_cast<unsigned>(
std::lround(ExprEngineDuration.getWallTime() * 1000)));
DisplayTime(ExprEngineDuration);
- if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) {
- AnalyzerTimers->clear();
- }
}
if (!Mgr->options.DumpExplodedGraphTo.empty())
@@ -843,9 +827,6 @@ void AnalysisConsumer::RunPathSensitiveChecks(Decl *D,
Eng.ViewGraph(Mgr->options.TrimGraph);
flushReports(BugReporterTimer.get(), Eng.getBugReporter());
- if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) {
- AnalyzerTimers->clear();
- }
}
//===----------------------------------------------------------------------===//
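
A sketch of the timer pattern after this change, assuming the third TimerGroup constructor argument (added by the llvm/Support/Timer.h change in this commit's diffstat) controls whether the group prints its report on destruction:

  #include "llvm/Support/Timer.h"

  static void timedAnalysis(bool ShouldDisplay) {
    llvm::TimerGroup TG("analyzer", "Analyzer timers",
                        /*PrintOnExit=*/ShouldDisplay);
    llvm::Timer T("syntaxchecks", "Syntax-based analysis time", TG);
    T.startTimer();
    // ... run checks ...
    T.stopTimer(); // no manual TG.clear() needed to suppress output
  }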
diff --git a/clang/test/CIR/CodeGen/delete.cpp b/clang/test/CIR/CodeGen/delete.cpp
index 69640aa..d8ac436 100644
--- a/clang/test/CIR/CodeGen/delete.cpp
+++ b/clang/test/CIR/CodeGen/delete.cpp
@@ -86,3 +86,42 @@ Container::~Container() { delete contents; }
// These functions are declared/defined below the calls in OGCG.
// OGCG: define linkonce_odr void @_ZN8ContentsD2Ev
// OGCG: declare void @_ZdlPvm(ptr noundef, i64 noundef)
+
+struct StructWithVirtualDestructor {
+ virtual ~StructWithVirtualDestructor();
+};
+
+void destroy(StructWithVirtualDestructor *x) {
+ delete x;
+}
+
+// CIR: cir.func {{.*}} @_Z7destroyP27StructWithVirtualDestructor(%[[X_ARG:.*]]: !cir.ptr<!rec_StructWithVirtualDestructor> {{.*}})
+// CIR: %[[X_ADDR:.*]] = cir.alloca !cir.ptr<!rec_StructWithVirtualDestructor>
+// CIR: cir.store %[[X_ARG]], %[[X_ADDR]]
+// CIR: %[[X:.*]] = cir.load{{.*}} %[[X_ADDR]]
+// CIR: %[[VTABLE_PTR:.*]] = cir.vtable.get_vptr %[[X]] : !cir.ptr<!rec_StructWithVirtualDestructor> -> !cir.ptr<!cir.vptr>
+// CIR: %[[VTABLE:.*]] = cir.load{{.*}} %[[VTABLE_PTR]] : !cir.ptr<!cir.vptr>, !cir.vptr
+// CIR: %[[DTOR_FN_ADDR_PTR:.*]] = cir.vtable.get_virtual_fn_addr %[[VTABLE]][1]
+// CIR: %[[DTOR_FN_ADDR:.*]] = cir.load{{.*}} %[[DTOR_FN_ADDR_PTR]]
+// CIR: cir.call %[[DTOR_FN_ADDR]](%[[X]])
+
+// LLVM: define {{.*}} void @_Z7destroyP27StructWithVirtualDestructor(ptr %[[X_ARG:.*]])
+// LLVM: %[[X_ADDR:.*]] = alloca ptr
+// LLVM: store ptr %[[X_ARG]], ptr %[[X_ADDR]]
+// LLVM: %[[X:.*]] = load ptr, ptr %[[X_ADDR]]
+// LLVM: %[[VTABLE:.*]] = load ptr, ptr %[[X]]
+// LLVM: %[[DTOR_FN_ADDR_PTR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i32 1
+// LLVM: %[[DTOR_FN_ADDR:.*]] = load ptr, ptr %[[DTOR_FN_ADDR_PTR]]
+// LLVM: call void %[[DTOR_FN_ADDR]](ptr %[[X]])
+
+// OGCG: define {{.*}} void @_Z7destroyP27StructWithVirtualDestructor(ptr {{.*}} %[[X_ARG:.*]])
+// OGCG: %[[X_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[X_ARG]], ptr %[[X_ADDR]]
+// OGCG: %[[X:.*]] = load ptr, ptr %[[X_ADDR]]
+// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[X]], null
+// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]]
+// OGCG: [[DELETE_NOTNULL]]:
+// OGCG: %[[VTABLE:.*]] = load ptr, ptr %[[X]]
+// OGCG: %[[DTOR_FN_ADDR_PTR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 1
+// OGCG: %[[DTOR_FN_ADDR:.*]] = load ptr, ptr %[[DTOR_FN_ADDR_PTR]]
+// OGCG: call void %[[DTOR_FN_ADDR]](ptr {{.*}} %[[X]])
diff --git a/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp b/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp
new file mode 100644
index 0000000..7429549
--- /dev/null
+++ b/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+struct StructWithDefaultCtor {
+ int n;
+};
+
+StructWithDefaultCtor defCtor = StructWithDefaultCtor();
+
+// CIR: cir.global {{.*}} @defCtor = #cir.zero : !rec_StructWithDefaultCtor
+// LLVM: @defCtor = global %struct.StructWithDefaultCtor zeroinitializer
+// OGCG: @defCtor = global %struct.StructWithDefaultCtor zeroinitializer
+
+struct StructWithCtorArg {
+ double value;
+ StructWithCtorArg(const double& x) : value(x) {}
+};
+
+StructWithCtorArg withArg = 0.0;
+
+// CIR: cir.global {{.*}} @withArg = #cir.zero : !rec_StructWithCtorArg
+// LLVM: @withArg = global %struct.StructWithCtorArg zeroinitializer
+// OGCG: @withArg = global %struct.StructWithCtorArg zeroinitializer
+
+// CIR: cir.func {{.*}} @__cxx_global_var_init()
+// CIR: %[[TMP0:.*]] = cir.alloca !cir.double, !cir.ptr<!cir.double>, ["ref.tmp0"]
+// CIR: %[[WITH_ARG:.*]] = cir.get_global @withArg : !cir.ptr<!rec_StructWithCtorArg>
+// CIR: %[[ZERO:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double
+// CIR: cir.store{{.*}} %[[ZERO]], %[[TMP0]] : !cir.double, !cir.ptr<!cir.double>
+// CIR: cir.call @_ZN17StructWithCtorArgC1ERKd(%[[WITH_ARG]], %[[TMP0]]) : (!cir.ptr<!rec_StructWithCtorArg>, !cir.ptr<!cir.double>) -> ()
+
+// LLVM: define {{.*}} void @__cxx_global_var_init()
+// LLVM: %[[TMP0:.*]] = alloca double
+// LLVM: store double 0.000000e+00, ptr %[[TMP0]]
+// LLVM: call void @_ZN17StructWithCtorArgC1ERKd(ptr @withArg, ptr %[[TMP0]])
+
+// OGCG: define {{.*}} void @__cxx_global_var_init()
+// OGCG: %[[TMP0:.*]] = alloca double
+// OGCG: store double 0.000000e+00, ptr %[[TMP0]]
+// OGCG: call void @_ZN17StructWithCtorArgC1ERKd(ptr {{.*}} @withArg, ptr {{.*}} %[[TMP0]])
diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
index 3fb2a94..d0fec18 100644
--- a/clang/test/Driver/fuchsia.c
+++ b/clang/test/Driver/fuchsia.c
@@ -312,3 +312,13 @@
// RUN: | FileCheck %s -check-prefix=CHECK-NOSTDLIB-NOLIBC
// CHECK-NOSTDLIB-NOLIBC-NOT: "warning:"
// CHECK-NOSTDLIB-NOLIBC-NOT: "error:"
+
+// RUN: not %clang -### %s --target=aarch64-unknown-fuchsia \
+// RUN: -fsanitize=safe-stack 2>&1 \
+// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN: | FileCheck %s -check-prefix=CHECK-NONX86-SAFESTACK
+// RUN: not %clang -### %s --target=riscv64-unknown-fuchsia \
+// RUN: -fsanitize=safe-stack 2>&1 \
+// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN: | FileCheck %s -check-prefix=CHECK-NONX86-SAFESTACK
+// CHECK-NONX86-SAFESTACK: error: unsupported option '-fsanitize=safe-stack' for target '{{.*}}'
diff --git a/clang/test/Driver/hexagon-toolchain-elf.c b/clang/test/Driver/hexagon-toolchain-elf.c
index de2ebfe..bff4819 100644
--- a/clang/test/Driver/hexagon-toolchain-elf.c
+++ b/clang/test/Driver/hexagon-toolchain-elf.c
@@ -166,6 +166,13 @@
// CHECK250: "-cc1" {{.*}} "-target-cpu" "hexagonv79"
// CHECK250: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v79/crt0
+// RUN: not %clang -### --target=hexagon-unknown-elf \
+// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
+// RUN: -mcpu=hexagonv81 -fuse-ld=hexagon-link \
+// RUN: %s 2>&1 | FileCheck -check-prefix=CHECK260 %s
+// CHECK260: "-cc1" {{.*}} "-target-cpu" "hexagonv81"
+// CHECK260: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v81/crt0
+
// -----------------------------------------------------------------------------
// Test Linker related args
// -----------------------------------------------------------------------------
diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c
index eebf481..bdedcf4 100644
--- a/clang/test/Preprocessor/hexagon-predefines.c
+++ b/clang/test/Preprocessor/hexagon-predefines.c
@@ -171,6 +171,23 @@
// CHECK-V79HVX-128B: #define __HVX__ 1
// CHECK-V79HVX-128B: #define __hexagon__ 1
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv81 %s\
+// RUN: | FileCheck %s -check-prefix CHECK-V81
+// CHECK-V81: #define __HEXAGON_ARCH__ 81
+// CHECK-V81: #define __HEXAGON_PHYSICAL_SLOTS__ 4
+// CHECK-V81: #define __HEXAGON_V81__ 1
+// CHECK-V81: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv81 \
+// RUN: -target-feature +hvxv81 -target-feature +hvx-length128b %s | FileCheck \
+// RUN: %s -check-prefix CHECK-V81HVX-128B
+// CHECK-V81HVX-128B: #define __HEXAGON_ARCH__ 81
+// CHECK-V81HVX-128B: #define __HEXAGON_V81__ 1
+// CHECK-V81HVX-128B: #define __HVX_ARCH__ 81
+// CHECK-V81HVX-128B: #define __HVX_LENGTH__ 128
+// CHECK-V81HVX-128B: #define __HVX__ 1
+// CHECK-V81HVX-128B: #define __hexagon__ 1
+
// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv67 \
// RUN: -target-feature +hvxv67 -target-feature +hvx-length128b %s | FileCheck \
// RUN: %s -check-prefix CHECK-ELF
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index 49f8843..52cffa4 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -313,17 +313,6 @@ int cc1_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
llvm::TimerGroup::clearAll();
if (llvm::timeTraceProfilerEnabled()) {
- // It is possible that the compiler instance doesn't own a file manager here
- // if we're compiling a module unit. Since the file manager are owned by AST
- // when we're compiling a module unit. So the file manager may be invalid
- // here.
- //
- // It should be fine to create file manager here since the file system
- // options are stored in the compiler invocation and we can recreate the VFS
- // from the compiler invocation.
- if (!Clang->hasFileManager())
- Clang->createFileManager();
-
if (auto profilerOutput = Clang->createOutputFile(
Clang->getFrontendOpts().TimeTracePath, /*Binary=*/false,
/*RemoveFileOnSignal=*/false,
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 05d97a5..5436051 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -138,23 +138,21 @@ void RTDEF(CUFRegisterAllocator)() {
void *CUFAllocPinned(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
void *p;
- CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
+ cudaMallocHost((void **)&p, sizeInBytes);
return p;
}
-void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
+void CUFFreePinned(void *p) { cudaFreeHost(p); }
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
void *p;
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
- CUDA_REPORT_IF_ERROR(
- cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
+ cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal);
} else {
if (asyncObject == nullptr) {
- CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+ cudaMalloc(&p, sizeInBytes);
} else {
- CUDA_REPORT_IF_ERROR(
- cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
+ cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject);
insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
}
}
@@ -167,21 +165,20 @@ void CUFFreeDevice(void *p) {
if (pos >= 0) {
cudaStream_t stream = deviceAllocations[pos].stream;
eraseAllocation(pos);
- CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
+ cudaFreeAsync(p, stream);
} else {
- CUDA_REPORT_IF_ERROR(cudaFree(p));
+ cudaFree(p);
}
}
void *CUFAllocManaged(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
void *p;
- CUDA_REPORT_IF_ERROR(
- cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
+ cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal);
return reinterpret_cast<void *>(p);
}
-void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
+void CUFFreeManaged(void *p) { cudaFree(p); }
void *CUFAllocUnified(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index a277690..7b52daf 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -166,7 +166,18 @@ add_header_macro(
stdfix.h
DEPENDS
.llvm-libc-macros.stdfix_macros
- .llvm-libc-types.stdfix-types
+ .llvm-libc-types.int_hk_t
+ .llvm-libc-types.int_hr_t
+ .llvm-libc-types.int_k_t
+ .llvm-libc-types.int_lk_t
+ .llvm-libc-types.int_lr_t
+ .llvm-libc-types.int_r_t
+ .llvm-libc-types.uint_uhk_t
+ .llvm-libc-types.uint_uhr_t
+ .llvm-libc-types.uint_uk_t
+ .llvm-libc-types.uint_ulk_t
+ .llvm-libc-types.uint_ulr_t
+ .llvm-libc-types.uint_ur_t
)
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt
index a428a0e..fcd0e12 100644
--- a/libc/include/llvm-libc-types/CMakeLists.txt
+++ b/libc/include/llvm-libc-types/CMakeLists.txt
@@ -174,7 +174,18 @@ DEPENDS
)
add_header(locale_t HDR locale_t.h)
add_header(struct_lconv HDR struct_lconv.h)
-add_header(stdfix-types HDR stdfix-types.h)
+add_header(int_hk_t HDR int_hk_t.h)
+add_header(int_hr_t HDR int_hr_t.h)
+add_header(int_k_t HDR int_k_t.h)
+add_header(int_lk_t HDR int_lk_t.h)
+add_header(int_lr_t HDR int_lr_t.h)
+add_header(int_r_t HDR int_r_t.h)
+add_header(uint_uhk_t HDR uint_uhk_t.h)
+add_header(uint_uhr_t HDR uint_uhr_t.h)
+add_header(uint_uk_t HDR uint_uk_t.h)
+add_header(uint_ulk_t HDR uint_ulk_t.h)
+add_header(uint_ulr_t HDR uint_ulr_t.h)
+add_header(uint_ur_t HDR uint_ur_t.h)
# UEFI
add_header(EFI_GUID HDR EFI_GUID.h DEPENDS libc.include.llvm-libc-macros.stdint_macros)
diff --git a/libc/include/llvm-libc-types/int_hk_t.h b/libc/include/llvm-libc-types/int_hk_t.h
new file mode 100644
index 0000000..eb17224
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_hk_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_HK_T_H
+#define LLVM_LIBC_TYPES_INT_HK_T_H
+
+typedef signed short int_hk_t;
+
+#endif // LLVM_LIBC_TYPES_INT_HK_T_H
diff --git a/libc/include/llvm-libc-types/int_hr_t.h b/libc/include/llvm-libc-types/int_hr_t.h
new file mode 100644
index 0000000..89b57a8
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_hr_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_HR_T_H
+#define LLVM_LIBC_TYPES_INT_HR_T_H
+
+typedef signed char int_hr_t;
+
+#endif // LLVM_LIBC_TYPES_INT_HR_T_H
diff --git a/libc/include/llvm-libc-types/int_k_t.h b/libc/include/llvm-libc-types/int_k_t.h
new file mode 100644
index 0000000..193f9d8
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_k_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_K_T_H
+#define LLVM_LIBC_TYPES_INT_K_T_H
+
+typedef signed int int_k_t;
+
+#endif // LLVM_LIBC_TYPES_INT_K_T_H
diff --git a/libc/include/llvm-libc-types/int_lk_t.h b/libc/include/llvm-libc-types/int_lk_t.h
new file mode 100644
index 0000000..c8f3137
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_lk_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_LK_T_H
+#define LLVM_LIBC_TYPES_INT_LK_T_H
+
+typedef signed long long int_lk_t;
+
+#endif // LLVM_LIBC_TYPES_INT_LK_T_H
diff --git a/libc/include/llvm-libc-types/int_lr_t.h b/libc/include/llvm-libc-types/int_lr_t.h
new file mode 100644
index 0000000..f2987e8
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_lr_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_LR_T_H
+#define LLVM_LIBC_TYPES_INT_LR_T_H
+
+typedef signed int int_lr_t;
+
+#endif // LLVM_LIBC_TYPES_INT_LR_T_H
diff --git a/libc/include/llvm-libc-types/int_r_t.h b/libc/include/llvm-libc-types/int_r_t.h
new file mode 100644
index 0000000..6b528f4
--- /dev/null
+++ b/libc/include/llvm-libc-types/int_r_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_INT_R_T_H
+#define LLVM_LIBC_TYPES_INT_R_T_H
+
+typedef signed short int int_r_t;
+
+#endif // LLVM_LIBC_TYPES_INT_R_T_H
diff --git a/libc/include/llvm-libc-types/stdfix-types.h b/libc/include/llvm-libc-types/stdfix-types.h
deleted file mode 100644
index 542d45e..0000000
--- a/libc/include/llvm-libc-types/stdfix-types.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===-- Definition of stdfix integer types --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_TYPES_STDFIX_TYPES_H
-#define LLVM_LIBC_TYPES_STDFIX_TYPES_H
-
-typedef signed char int_hr_t;
-typedef signed short int int_r_t;
-typedef signed int int_lr_t;
-typedef signed short int_hk_t;
-typedef signed int int_k_t;
-typedef signed long long int_lk_t;
-typedef unsigned char uint_uhr_t;
-typedef unsigned short int uint_ur_t;
-typedef unsigned int uint_ulr_t;
-typedef unsigned short int uint_uhk_t;
-typedef unsigned int uint_uk_t;
-typedef unsigned long long uint_ulk_t;
-
-#endif // LLVM_LIBC_TYPES_STDFIX_TYPES_H
diff --git a/libc/include/llvm-libc-types/uint_uhk_t.h b/libc/include/llvm-libc-types/uint_uhk_t.h
new file mode 100644
index 0000000..ed0f427
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_uhk_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_UHK_T_H
+#define LLVM_LIBC_TYPES_UINT_UHK_T_H
+
+typedef unsigned short int uint_uhk_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_UHK_T_H
diff --git a/libc/include/llvm-libc-types/uint_uhr_t.h b/libc/include/llvm-libc-types/uint_uhr_t.h
new file mode 100644
index 0000000..e3eab1c
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_uhr_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_UHR_T_H
+#define LLVM_LIBC_TYPES_UINT_UHR_T_H
+
+typedef unsigned char uint_uhr_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_UHR_T_H
diff --git a/libc/include/llvm-libc-types/uint_uk_t.h b/libc/include/llvm-libc-types/uint_uk_t.h
new file mode 100644
index 0000000..f9bbc7b
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_uk_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_UK_T_H
+#define LLVM_LIBC_TYPES_UINT_UK_T_H
+
+typedef unsigned int uint_uk_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_UK_T_H
diff --git a/libc/include/llvm-libc-types/uint_ulk_t.h b/libc/include/llvm-libc-types/uint_ulk_t.h
new file mode 100644
index 0000000..b7a54b0
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_ulk_t.h
@@ -0,0 +1,14 @@
+//===-- LLVM LIBC TYPES --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_ULK_T_H
+#define LLVM_LIBC_TYPES_UINT_ULK_T_H
+
+typedef unsigned long long uint_ulk_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_ULK_T_H
diff --git a/libc/include/llvm-libc-types/uint_ulr_t.h b/libc/include/llvm-libc-types/uint_ulr_t.h
new file mode 100644
index 0000000..9f4682c
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_ulr_t.h
@@ -0,0 +1,14 @@
+//===-- Definition of the type uint_ulr_t --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_ULR_T_H
+#define LLVM_LIBC_TYPES_UINT_ULR_T_H
+
+typedef unsigned int uint_ulr_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_ULR_T_H
diff --git a/libc/include/llvm-libc-types/uint_ur_t.h b/libc/include/llvm-libc-types/uint_ur_t.h
new file mode 100644
index 0000000..267c9c4
--- /dev/null
+++ b/libc/include/llvm-libc-types/uint_ur_t.h
@@ -0,0 +1,14 @@
+//===-- Definition of the type uint_ur_t ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_UINT_UR_T_H
+#define LLVM_LIBC_TYPES_UINT_UR_T_H
+
+typedef unsigned short int uint_ur_t;
+
+#endif // LLVM_LIBC_TYPES_UINT_UR_T_H
diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml
index 451330c..9f1a32f 100644
--- a/libc/include/stdfix.yaml
+++ b/libc/include/stdfix.yaml
@@ -8,7 +8,19 @@ macros:
- macro_name: roundfx
macro_header: stdfix-macros.h
types:
- - type_name: stdfix-types
+ - type_name: int_hk_t
+ - type_name: int_hr_t
+ - type_name: int_k_t
+ - type_name: int_lk_t
+ - type_name: int_lr_t
+ - type_name: int_r_t
+ - type_name: uint_uhk_t
+ - type_name: uint_uhr_t
+ - type_name: uint_uk_t
+ - type_name: uint_ulk_t
+ - type_name: uint_ulr_t
+ - type_name: uint_ur_t
+
enums: []
objects: []
functions:
diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt
index 3cbabd1..e9841cd 100644
--- a/libc/src/stdfix/CMakeLists.txt
+++ b/libc/src/stdfix/CMakeLists.txt
@@ -58,7 +58,6 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk)
${libc_opt_high_flag}
DEPENDS
libc.src.__support.fixed_point.fx_bits
- libc.include.llvm-libc-types.stdfix-types
libc.include.llvm-libc-macros.stdfix_macros
)
diff --git a/libc/src/stdfix/bitshk.cpp b/libc/src/stdfix/bitshk.cpp
index d0a3e128..be4e71a 100644
--- a/libc/src/stdfix/bitshk.cpp
+++ b/libc/src/stdfix/bitshk.cpp
@@ -8,7 +8,7 @@
#include "bitshk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // short accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_hk_t
+#include "include/llvm-libc-types/int_hk_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
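For context, bitshk returns the raw object representation of a short accum as
int_hk_t. A rough model in plain integer arithmetic, assuming the common
layout where short accum carries 7 fractional bits (the exact layout is
target-dependent):

#include <cstdint>

// Under an s8.7 layout, bitshk(1.5hk) is 1.5 * 2^7 == 192 (0x00C0).
static_assert(static_cast<int16_t>(1.5 * (1 << 7)) == 192,
              "raw pattern of 1.5hk under a 7-fraction-bit layout");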
diff --git a/libc/src/stdfix/bitshk.h b/libc/src/stdfix/bitshk.h
index a1505e2..2439576 100644
--- a/libc/src/stdfix/bitshk.h
+++ b/libc/src/stdfix/bitshk.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSHK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // short accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_hk_t
+#include "include/llvm-libc-types/int_hk_t.h" // int_hk_t
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitshr.cpp b/libc/src/stdfix/bitshr.cpp
index 394d1f0..e70bf0a 100644
--- a/libc/src/stdfix/bitshr.cpp
+++ b/libc/src/stdfix/bitshr.cpp
@@ -8,7 +8,7 @@
#include "bitshr.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // short fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_hr_t
+#include "include/llvm-libc-types/int_hr_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitshr.h b/libc/src/stdfix/bitshr.h
index d5b4b8f..c1517a9 100644
--- a/libc/src/stdfix/bitshr.h
+++ b/libc/src/stdfix/bitshr.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSHR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // short fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_hr_t
+#include "include/llvm-libc-types/int_hr_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsk.cpp b/libc/src/stdfix/bitsk.cpp
index f8c9d77..0f1a3f3 100644
--- a/libc/src/stdfix/bitsk.cpp
+++ b/libc/src/stdfix/bitsk.cpp
@@ -8,7 +8,7 @@
#include "bitsk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_k_t
+#include "include/llvm-libc-types/int_k_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsk.h b/libc/src/stdfix/bitsk.h
index 32d5a72..3e41b43 100644
--- a/libc/src/stdfix/bitsk.h
+++ b/libc/src/stdfix/bitsk.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_k_t
+#include "include/llvm-libc-types/int_k_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitslk.cpp b/libc/src/stdfix/bitslk.cpp
index f4af2a8..1065c32 100644
--- a/libc/src/stdfix/bitslk.cpp
+++ b/libc/src/stdfix/bitslk.cpp
@@ -8,7 +8,7 @@
#include "bitslk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // long accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_lk_t
+#include "include/llvm-libc-types/int_lk_t.h" // int_lk_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitslk.h b/libc/src/stdfix/bitslk.h
index 821116b..e1a0c18 100644
--- a/libc/src/stdfix/bitslk.h
+++ b/libc/src/stdfix/bitslk.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSLK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // long accum
-#include "include/llvm-libc-types/stdfix-types.h" // int_lk_t
+#include "include/llvm-libc-types/int_lk_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitslr.cpp b/libc/src/stdfix/bitslr.cpp
index 3b38aa2..e8a4993 100644
--- a/libc/src/stdfix/bitslr.cpp
+++ b/libc/src/stdfix/bitslr.cpp
@@ -8,7 +8,7 @@
#include "bitslr.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // long fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_lr_t
+#include "include/llvm-libc-types/int_lr_t.h" // int_lr_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitslr.h b/libc/src/stdfix/bitslr.h
index 0cb5972..d0e468b6 100644
--- a/libc/src/stdfix/bitslr.h
+++ b/libc/src/stdfix/bitslr.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSLR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // long fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_lr_t
+#include "include/llvm-libc-types/int_lr_t.h" // int_lr_t
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsr.cpp b/libc/src/stdfix/bitsr.cpp
index 2b6ad2c..4eefe28 100644
--- a/libc/src/stdfix/bitsr.cpp
+++ b/libc/src/stdfix/bitsr.cpp
@@ -8,7 +8,7 @@
#include "bitsr.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_r_t
+#include "include/llvm-libc-types/int_r_t.h" // int_r_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsr.h b/libc/src/stdfix/bitsr.h
index e071f03..5742a59 100644
--- a/libc/src/stdfix/bitsr.h
+++ b/libc/src/stdfix/bitsr.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // fract
-#include "include/llvm-libc-types/stdfix-types.h" // int_r_t
+#include "include/llvm-libc-types/int_r_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsuhk.cpp b/libc/src/stdfix/bitsuhk.cpp
index 1b0bf59a..62f7cf3 100755
--- a/libc/src/stdfix/bitsuhk.cpp
+++ b/libc/src/stdfix/bitsuhk.cpp
@@ -8,7 +8,7 @@
#include "bitsuhk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned short accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uhk_t
+#include "include/llvm-libc-types/uint_uhk_t.h" // uint_uhk_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsuhk.h b/libc/src/stdfix/bitsuhk.h
index 1e80286..b308282 100755
--- a/libc/src/stdfix/bitsuhk.h
+++ b/libc/src/stdfix/bitsuhk.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSUHK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned short accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uhk_t
+#include "include/llvm-libc-types/uint_uhk_t.h" // uint_uhk_t
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsuhr.cpp b/libc/src/stdfix/bitsuhr.cpp
index 66152e1..236b2a0 100755
--- a/libc/src/stdfix/bitsuhr.cpp
+++ b/libc/src/stdfix/bitsuhr.cpp
@@ -8,7 +8,7 @@
#include "bitsuhr.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned short fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uhr_t
+#include "include/llvm-libc-types/uint_uhr_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsuhr.h b/libc/src/stdfix/bitsuhr.h
index 0311665..662cd9c 100755
--- a/libc/src/stdfix/bitsuhr.h
+++ b/libc/src/stdfix/bitsuhr.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSUHR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned short fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uhr_t
+#include "include/llvm-libc-types/uint_uhr_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsuk.cpp b/libc/src/stdfix/bitsuk.cpp
index b0a92bd..32a4fbe 100755
--- a/libc/src/stdfix/bitsuk.cpp
+++ b/libc/src/stdfix/bitsuk.cpp
@@ -8,7 +8,7 @@
#include "bitsuk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uk_t
+#include "include/llvm-libc-types/uint_uk_t.h" // uint_uk_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsuk.h b/libc/src/stdfix/bitsuk.h
index fce37e8..b73d64b 100755
--- a/libc/src/stdfix/bitsuk.h
+++ b/libc/src/stdfix/bitsuk.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSUK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uk_t
+#include "include/llvm-libc-types/uint_uk_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsulk.cpp b/libc/src/stdfix/bitsulk.cpp
index b8f61a1..d1000da 100755
--- a/libc/src/stdfix/bitsulk.cpp
+++ b/libc/src/stdfix/bitsulk.cpp
@@ -8,7 +8,7 @@
#include "bitsulk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned long accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ulk_t
+#include "include/llvm-libc-types/uint_ulk_t.h" // uint_ulk_t
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsulk.h b/libc/src/stdfix/bitsulk.h
index 1bf681e..9a1a9e9 100755
--- a/libc/src/stdfix/bitsulk.h
+++ b/libc/src/stdfix/bitsulk.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_SRC_STDFIX_BITSULK_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned long accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ulk_t
+#include "include/llvm-libc-types/uint_ulk_t.h"
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsulr.cpp b/libc/src/stdfix/bitsulr.cpp
index 9fd1b15..4099b71 100755
--- a/libc/src/stdfix/bitsulr.cpp
+++ b/libc/src/stdfix/bitsulr.cpp
@@ -8,7 +8,7 @@
#include "bitsulr.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned long fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ulr_t
+#include "include/llvm-libc-types/uint_ulr_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsulr.h b/libc/src/stdfix/bitsulr.h
index cf0f6fb..7ca985d 100755
--- a/libc/src/stdfix/bitsulr.h
+++ b/libc/src/stdfix/bitsulr.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSULR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned long fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ulr_t
+#include "include/llvm-libc-types/uint_ulr_t.h" // uint_ulr_t
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitsur.cpp b/libc/src/stdfix/bitsur.cpp
index ffb52de..8d422d4 100755
--- a/libc/src/stdfix/bitsur.cpp
+++ b/libc/src/stdfix/bitsur.cpp
@@ -8,7 +8,7 @@
#include "bitsur.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ur_t
+#include "include/llvm-libc-types/uint_ur_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/bitsur.h b/libc/src/stdfix/bitsur.h
index 4c938bb..c413ef6 100755
--- a/libc/src/stdfix/bitsur.h
+++ b/libc/src/stdfix/bitsur.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_BITSUR_H
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned fract
-#include "include/llvm-libc-types/stdfix-types.h" // uint_ur_t
+#include "include/llvm-libc-types/uint_ur_t.h" // uint_ur_t
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/bitusk.cpp b/libc/src/stdfix/bitusk.cpp
index ac0852e..8309927 100755
--- a/libc/src/stdfix/bitusk.cpp
+++ b/libc/src/stdfix/bitusk.cpp
@@ -8,7 +8,7 @@
#include "bitsuk.h"
#include "include/llvm-libc-macros/stdfix-macros.h" // unsigned accum
-#include "include/llvm-libc-types/stdfix-types.h" // uint_uk_t
+#include "include/llvm-libc-types/uint_uk_t.h"
#include "src/__support/common.h" // LLVM_LIBC_FUNCTION
#include "src/__support/fixed_point/fx_bits.h" // fixed_point
#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdfix/hkbits.h b/libc/src/stdfix/hkbits.h
index bf1bf82..f674492 100644
--- a/libc/src/stdfix/hkbits.h
+++ b/libc/src/stdfix/hkbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_HKBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_hk_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/hrbits.h b/libc/src/stdfix/hrbits.h
index c6bd99a..7cfda64 100644
--- a/libc/src/stdfix/hrbits.h
+++ b/libc/src/stdfix/hrbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_HRBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_hr_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/kbits.h b/libc/src/stdfix/kbits.h
index 7e709af..c45cf8d 100644
--- a/libc/src/stdfix/kbits.h
+++ b/libc/src/stdfix/kbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_KBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_k_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/lkbits.h b/libc/src/stdfix/lkbits.h
index 177f121..2d30bdb 100644
--- a/libc/src/stdfix/lkbits.h
+++ b/libc/src/stdfix/lkbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_LKBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_lk_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/lrbits.h b/libc/src/stdfix/lrbits.h
index 087aa19..000594ff 100644
--- a/libc/src/stdfix/lrbits.h
+++ b/libc/src/stdfix/lrbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_LRBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_lr_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/rbits.h b/libc/src/stdfix/rbits.h
index 4922e1c..df11cec 100644
--- a/libc/src/stdfix/rbits.h
+++ b/libc/src/stdfix/rbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_RBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/int_r_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/uhkbits.h b/libc/src/stdfix/uhkbits.h
index 7ef1598..3eef3aa 100644
--- a/libc/src/stdfix/uhkbits.h
+++ b/libc/src/stdfix/uhkbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_UHKBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_uhk_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/uhrbits.h b/libc/src/stdfix/uhrbits.h
index 4011a0e..7198e70 100644
--- a/libc/src/stdfix/uhrbits.h
+++ b/libc/src/stdfix/uhrbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_UHRBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_uhr_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/ukbits.h b/libc/src/stdfix/ukbits.h
index 2374f48..20302cd 100644
--- a/libc/src/stdfix/ukbits.h
+++ b/libc/src/stdfix/ukbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_UKBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_uk_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/ulkbits.h b/libc/src/stdfix/ulkbits.h
index 955c96c..04a74f3 100644
--- a/libc/src/stdfix/ulkbits.h
+++ b/libc/src/stdfix/ulkbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_ULKBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_ulk_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/ulrbits.h b/libc/src/stdfix/ulrbits.h
index 7ae49ba..e5d55ab 100644
--- a/libc/src/stdfix/ulrbits.h
+++ b/libc/src/stdfix/ulrbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_ULRBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_ulr_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdfix/urbits.h b/libc/src/stdfix/urbits.h
index 8fc0881..ffa3b38 100644
--- a/libc/src/stdfix/urbits.h
+++ b/libc/src/stdfix/urbits.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_STDFIX_URBITS_H
#include "include/llvm-libc-macros/stdfix-macros.h"
-#include "include/llvm-libc-types/stdfix-types.h"
+#include "include/llvm-libc-types/uint_ur_t.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt
index 7415222..04bedc3 100644
--- a/libc/test/src/stdfix/CMakeLists.txt
+++ b/libc/test/src/stdfix/CMakeLists.txt
@@ -4,7 +4,6 @@ endif()
add_custom_target(libc-stdfix-tests)
-
foreach(suffix IN ITEMS hr r lr hk k lk)
add_libc_test(
abs${suffix}_test
@@ -84,6 +83,18 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk)
SRCS
bits${suffix}_test.cpp
DEPENDS
+      libc.include.llvm-libc-types.int_hk_t
+      libc.include.llvm-libc-types.int_hr_t
+      libc.include.llvm-libc-types.int_k_t
+      libc.include.llvm-libc-types.int_lk_t
+      libc.include.llvm-libc-types.int_lr_t
+      libc.include.llvm-libc-types.int_r_t
+      libc.include.llvm-libc-types.uint_uhk_t
+      libc.include.llvm-libc-types.uint_uhr_t
+      libc.include.llvm-libc-types.uint_uk_t
+      libc.include.llvm-libc-types.uint_ulk_t
+      libc.include.llvm-libc-types.uint_ulr_t
+      libc.include.llvm-libc-types.uint_ur_t
libc.src.stdfix.bits${suffix}
libc.src.__support.fixed_point.fx_rep
libc.src.__support.fixed_point.fx_bits
diff --git a/libc/test/src/stdfix/FxBitsTest.h b/libc/test/src/stdfix/FxBitsTest.h
index ae04cc1..18dacad 100644
--- a/libc/test/src/stdfix/FxBitsTest.h
+++ b/libc/test/src/stdfix/FxBitsTest.h
@@ -8,7 +8,6 @@
#include "test/UnitTest/Test.h"
-#include "include/llvm-libc-types/stdfix-types.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/fixed_point/fx_bits.h"
diff --git a/libc/test/src/stdfix/bitshk_test.cpp b/libc/test/src/stdfix/bitshk_test.cpp
index ca83162d..85dbf0a8 100644
--- a/libc/test/src/stdfix/bitshk_test.cpp
+++ b/libc/test/src/stdfix/bitshk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_hk_t
+#include "llvm-libc-types/int_hk_t.h" // int_hk_t
#include "src/stdfix/bitshk.h"
LIST_BITSFX_TESTS(hk, short accum, int_hk_t, LIBC_NAMESPACE::bitshk);
diff --git a/libc/test/src/stdfix/bitshr_test.cpp b/libc/test/src/stdfix/bitshr_test.cpp
index 220d7f6..d9583ad2 100644
--- a/libc/test/src/stdfix/bitshr_test.cpp
+++ b/libc/test/src/stdfix/bitshr_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_hr_t
+#include "llvm-libc-types/int_hr_t.h" // int_hr_t
#include "src/stdfix/bitshr.h"
LIST_BITSFX_TESTS(hr, short fract, int_hr_t, LIBC_NAMESPACE::bitshr);
diff --git a/libc/test/src/stdfix/bitsk_test.cpp b/libc/test/src/stdfix/bitsk_test.cpp
index 7e0057b..8d38ab5 100644
--- a/libc/test/src/stdfix/bitsk_test.cpp
+++ b/libc/test/src/stdfix/bitsk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_k_t
+#include "llvm-libc-types/int_k_t.h" // int_k_t
#include "src/stdfix/bitsk.h"
LIST_BITSFX_TESTS(k, accum, int_k_t, LIBC_NAMESPACE::bitsk);
diff --git a/libc/test/src/stdfix/bitslk_test.cpp b/libc/test/src/stdfix/bitslk_test.cpp
index 46c04e2..fc6fcbb 100644
--- a/libc/test/src/stdfix/bitslk_test.cpp
+++ b/libc/test/src/stdfix/bitslk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_lk_t
+#include "llvm-libc-types/int_lk_t.h" // int_lk_t
#include "src/stdfix/bitslk.h"
LIST_BITSFX_TESTS(lk, long accum, int_lk_t, LIBC_NAMESPACE::bitslk);
diff --git a/libc/test/src/stdfix/bitslr_test.cpp b/libc/test/src/stdfix/bitslr_test.cpp
index ef68d28..f88580c 100644
--- a/libc/test/src/stdfix/bitslr_test.cpp
+++ b/libc/test/src/stdfix/bitslr_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_lr_t
+#include "llvm-libc-types/int_lr_t.h" // int_lr_t
#include "src/stdfix/bitslr.h"
 LIST_BITSFX_TESTS(lr, long fract, int_lr_t, LIBC_NAMESPACE::bitslr);
diff --git a/libc/test/src/stdfix/bitsr_test.cpp b/libc/test/src/stdfix/bitsr_test.cpp
index 0aeb980..e987a7f 100644
--- a/libc/test/src/stdfix/bitsr_test.cpp
+++ b/libc/test/src/stdfix/bitsr_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // int_r_t
+#include "llvm-libc-types/int_r_t.h" // int_r_t
#include "src/stdfix/bitsr.h"
LIST_BITSFX_TESTS(r, fract, int_r_t, LIBC_NAMESPACE::bitsr);
diff --git a/libc/test/src/stdfix/bitsuhk_test.cpp b/libc/test/src/stdfix/bitsuhk_test.cpp
index 5ddb783..5b09453 100644
--- a/libc/test/src/stdfix/bitsuhk_test.cpp
+++ b/libc/test/src/stdfix/bitsuhk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_uhk_t
+#include "llvm-libc-types/uint_uhk_t.h" // uint_uhk_t
#include "src/stdfix/bitsuhk.h"
LIST_BITSFX_TESTS(uhk, unsigned short accum, uint_uhk_t,
diff --git a/libc/test/src/stdfix/bitsuhr_test.cpp b/libc/test/src/stdfix/bitsuhr_test.cpp
index 6f5d559..0d006a3 100644
--- a/libc/test/src/stdfix/bitsuhr_test.cpp
+++ b/libc/test/src/stdfix/bitsuhr_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_uhr_t
+#include "llvm-libc-types/uint_uhr_t.h" // uint_uhr_t
#include "src/stdfix/bitsuhr.h"
LIST_BITSFX_TESTS(uhr, unsigned short fract, uint_uhr_t,
diff --git a/libc/test/src/stdfix/bitsuk_test.cpp b/libc/test/src/stdfix/bitsuk_test.cpp
index 309c525..06a8a4e 100644
--- a/libc/test/src/stdfix/bitsuk_test.cpp
+++ b/libc/test/src/stdfix/bitsuk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_uk_t
+#include "llvm-libc-types/uint_uk_t.h" // uint_uk_t
#include "src/stdfix/bitsuk.h"
LIST_BITSFX_TESTS(uk, unsigned accum, uint_uk_t, LIBC_NAMESPACE::bitsuk);
diff --git a/libc/test/src/stdfix/bitsulk_test.cpp b/libc/test/src/stdfix/bitsulk_test.cpp
index cba011d..9d56a18 100644
--- a/libc/test/src/stdfix/bitsulk_test.cpp
+++ b/libc/test/src/stdfix/bitsulk_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_ulk_t
+#include "llvm-libc-types/uint_ulk_t.h" // uint_ulk_t
#include "src/stdfix/bitsulk.h"
LIST_BITSFX_TESTS(ulk, unsigned long accum, uint_ulk_t,
diff --git a/libc/test/src/stdfix/bitsulr_test.cpp b/libc/test/src/stdfix/bitsulr_test.cpp
index 39b21c4..3ebbf8a 100644
--- a/libc/test/src/stdfix/bitsulr_test.cpp
+++ b/libc/test/src/stdfix/bitsulr_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_ulr_t
+#include "llvm-libc-types/uint_ulr_t.h" // uint_ulr_t
#include "src/stdfix/bitsulr.h"
LIST_BITSFX_TESTS(ulr, unsigned long fract, uint_ulr_t,
diff --git a/libc/test/src/stdfix/bitsur_test.cpp b/libc/test/src/stdfix/bitsur_test.cpp
index b7c4b06..4f36807 100644
--- a/libc/test/src/stdfix/bitsur_test.cpp
+++ b/libc/test/src/stdfix/bitsur_test.cpp
@@ -8,7 +8,7 @@
#include "BitsFxTest.h"
-#include "llvm-libc-types/stdfix-types.h" // uint_ur_t
+#include "llvm-libc-types/uint_ur_t.h" // uint_ur_t
#include "src/stdfix/bitsur.h"
LIST_BITSFX_TESTS(ur, unsigned fract, uint_ur_t, LIBC_NAMESPACE::bitsur);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index c0e426c..a458cbd 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -497,6 +497,7 @@ public:
LLVM_ABI LegalizeResult lowerMinMax(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFCopySign(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
+ LLVM_ABI LegalizeResult lowerFMinimumMaximum(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFFloor(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a70c9c0..d6ed3a8 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3576,6 +3576,10 @@ public:
return nullptr;
}
+ const RTLIB::RuntimeLibcallsInfo &getRuntimeLibcallsInfo() const {
+ return Libcalls;
+ }
+
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
Libcalls.setLibcallImpl(Call, Impl);
}
@@ -3813,10 +3817,6 @@ private:
/// The list of libcalls that the target will use.
RTLIB::RuntimeLibcallsInfo Libcalls;
- /// The ISD::CondCode that should be used to test the result of each of the
- /// comparison libcall against zero.
- ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
-
/// The bits of IndexedModeActions used to store the legalisation actions
/// We store the data as | ML | MS | L | S | each taking 4 bits.
enum IndexedModeActionsBits {
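The new read accessor pairs with the existing setLibcallImpl. A minimal sketch
of the intended backend-side use, mirroring the AArch64 changes later in this
patch (the helper name is made up):

#include "llvm/CodeGen/TargetLowering.h"

// Hypothetical helper: pass the libcall table, not the whole lowering
// object, to code that only needs libcall queries.
static const llvm::RTLIB::RuntimeLibcallsInfo &
libcallTable(const llvm::TargetLowering &TLI) {
  return TLI.getRuntimeLibcallsInfo();
}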
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 67b873d..20ba51a 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -447,3 +447,15 @@ def int_hexagon_instrprof_custom
include "llvm/IR/IntrinsicsHexagonDep.td"
+
+class Hexagon__ptri32i32v64i16_Intrinsic<string GCCIntSuffix,
+ list<IntrinsicProperty> intr_properties = [IntrNoMem]>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,llvm_v64i16_ty],
+ intr_properties>;
+
+def int_hexagon_V6_vgather_vscattermh :
+Hexagon__ptri32i32v64i16_Intrinsic<"HEXAGON_V6_vgather_vscattermh", [IntrArgMemOnly]>;
+
+def int_hexagon_V6_vgather_vscattermh_128B :
+Hexagon__ptri32i32v32i32_Intrinsic<"HEXAGON_V6_vgather_vscattermh_128B", [IntrArgMemOnly]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
index 0cd6008..fe95377 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
@@ -6832,3 +6832,17 @@ Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_hf_f8">;
def int_hexagon_V6_vsub_hf_f8_128B :
Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_hf_f8_128B">;
+
+// V81 HVX Instructions.
+
+def int_hexagon_V6_vsub_hf_mix :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_hf_mix">;
+
+def int_hexagon_V6_vsub_hf_mix_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_hf_mix_128B">;
+
+def int_hexagon_V6_vsub_sf_mix :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_sf_mix">;
+
+def int_hexagon_V6_vsub_sf_mix_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_sf_mix_128B">;
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
index 4a0272c..b1bb24e 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
@@ -162,6 +162,10 @@ let TargetPrefix = "riscv" in {
defm "" : RISCVSFCustomVC_XVV<["x", "i", "v", "f"]>;
defm "" : RISCVSFCustomVC_XVW<["x", "i", "v", "f"]>;
+ // XSfvfexp* and XSfvfexpa*
+ defm sf_vfexp : RISCVUnaryAA;
+ defm sf_vfexpa : RISCVUnaryAA;
+
// XSfvqmaccdod
def int_riscv_sf_vqmaccu_2x8x2 : RISCVSFCustomVMACC;
def int_riscv_sf_vqmacc_2x8x2 : RISCVSFCustomVMACC;
diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h
index 6a44758..527d67f 100644
--- a/llvm/include/llvm/Support/Timer.h
+++ b/llvm/include/llvm/Support/Timer.h
@@ -209,6 +209,7 @@ class TimerGroup {
std::string Description;
Timer *FirstTimer = nullptr; ///< First timer in the group.
std::vector<PrintRecord> TimersToPrint;
+ bool PrintOnExit;
TimerGroup **Prev; ///< Pointer to Next field of previous timergroup in list.
TimerGroup *Next; ///< Pointer to next timergroup in list.
@@ -217,13 +218,15 @@ class TimerGroup {
friend class TimerGlobals;
explicit TimerGroup(StringRef Name, StringRef Description,
- sys::SmartMutex<true> &lock);
+ sys::SmartMutex<true> &lock, bool PrintOnExit);
public:
- LLVM_ABI explicit TimerGroup(StringRef Name, StringRef Description);
+ LLVM_ABI explicit TimerGroup(StringRef Name, StringRef Description,
+ bool PrintOnExit = true);
LLVM_ABI explicit TimerGroup(StringRef Name, StringRef Description,
- const StringMap<TimeRecord> &Records);
+ const StringMap<TimeRecord> &Records,
+ bool PrintOnExit = true);
LLVM_ABI ~TimerGroup();
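A minimal sketch of the new knob (the group name is made up): a caller that
aggregates and reports timing data itself can construct a group that stays
silent in its destructor.

#include "llvm/Support/Timer.h"

// Hypothetical caller-managed group: no automatic report at destruction.
llvm::TimerGroup QuietGroup("quiet", "Caller-managed timers",
                            /*PrintOnExit=*/false);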
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 853bd66..a572eef 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1582,6 +1582,23 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,
return nullptr;
}
+/// Returns the absolute value of \p A. In the context of dependence analysis,
+/// we need an absolute value in the mathematical sense. If \p A is the signed
+/// minimum value, its absolute value cannot be represented without extending
+/// the original type. Thus, if we cannot prove that \p A is not the signed
+/// minimum value, this returns nullptr.
+static const SCEV *absSCEVNoSignedOverflow(const SCEV *A, ScalarEvolution &SE) {
+  IntegerType *Ty = dyn_cast<IntegerType>(A->getType());
+  if (!Ty)
+    return nullptr;
+
+ const SCEV *SMin =
+ SE.getConstant(APInt::getSignedMinValue(Ty->getBitWidth()));
+ if (!SE.isKnownPredicate(CmpInst::ICMP_NE, A, SMin))
+ return nullptr;
+ return SE.getAbsExpr(A, /*IsNSW=*/true);
+}
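A worked example of why the signed-minimum guard is needed, sketched in plain
C++ (8-bit chosen only to keep the numbers small):

#include <cstdint>

// In i8, the signed minimum is -128. Negating it wraps back to -128 in
// two's complement, so a naive negate-if-negative "absolute value" can
// stay negative; the ICMP_NE check above rejects exactly this case.
int8_t naive_abs(int8_t A) { return A < 0 ? -A : A; } // naive_abs(-128) == -128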
+
/// Returns true iff \p Test is enabled.
static bool isDependenceTestEnabled(DependenceTestType Test) {
if (EnableDependenceTest == DependenceTestType::All)
@@ -1669,21 +1686,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
// check that |Delta| < iteration count
- if (const SCEV *UpperBound =
- collectUpperBound(CurSrcLoop, Delta->getType())) {
+ bool IsDeltaLarge = [&] {
+ const SCEV *UpperBound = collectUpperBound(CurSrcLoop, Delta->getType());
+ if (!UpperBound)
+ return false;
+
LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
- const SCEV *AbsDelta =
- SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
- const SCEV *AbsCoeff =
- SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+ const SCEV *AbsDelta = absSCEVNoSignedOverflow(Delta, *SE);
+ const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE);
+ if (!AbsDelta || !AbsCoeff)
+ return false;
const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
- if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
- // Distance greater than trip count - no dependence
- ++StrongSIVindependence;
- ++StrongSIVsuccesses;
- return true;
- }
+ return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product);
+ }();
+ if (IsDeltaLarge) {
+ // Distance greater than trip count - no dependence
+ ++StrongSIVindependence;
+ ++StrongSIVsuccesses;
+ return true;
}
// Can we compute distance?
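To make the refactored check concrete, a sketch of the independence case it
proves; the loop and constants are invented for illustration:

// Delta = 100, Coeff = 2, and taking the upper bound as the last index
// value, 9: |Delta| = 100 > |Coeff| * UpperBound = 18, so no dependence.
void strong_siv_example(int *A) {
  for (int i = 0; i < 10; ++i) {
    A[2 * i + 100] = 1; // writes 100..118 (even)
    (void)A[2 * i];     // reads 0..18 (even); ranges never overlap
  }
}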
@@ -2259,6 +2280,9 @@ bool DependenceInfo::weakZeroSrcSIVtest(
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
if (!ConstCoeff)
return false;
+
+ // Since ConstCoeff is constant, !isKnownNegative means it's non-negative.
+ // TODO: Bail out if it's a signed minimum value.
const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
? SE->getNegativeSCEV(ConstCoeff)
: ConstCoeff;
@@ -2369,6 +2393,9 @@ bool DependenceInfo::weakZeroDstSIVtest(
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
if (!ConstCoeff)
return false;
+
+ // Since ConstCoeff is constant, !isKnownNegative means it's non-negative.
+ // TODO: Bail out if it's a signed minimum value.
const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
? SE->getNegativeSCEV(ConstCoeff)
: ConstCoeff;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index f9d27b0..178529f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4748,6 +4748,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_FMINIMUMNUM:
case G_FMAXIMUMNUM:
return lowerFMinNumMaxNum(MI);
+ case G_FMINIMUM:
+ case G_FMAXIMUM:
+ return lowerFMinimumMaximum(MI);
case G_MERGE_VALUES:
return lowerMergeValues(MI);
case G_UNMERGE_VALUES:
@@ -8777,6 +8780,77 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+ LLT CmpTy = Ty.changeElementSize(1);
+
+ bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
+ unsigned OpcIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
+ unsigned OpcNonIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
+ bool MinMaxMustRespectOrderedZero = false;
+ Register Res;
+
+ // IEEE variants don't need canonicalization
+ if (LI.isLegalOrCustom({OpcIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0);
+ MinMaxMustRespectOrderedZero = true;
+ } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0);
+ } else {
+ auto Compare = MIRBuilder.buildFCmp(
+ IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1);
+ Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0);
+ }
+
+  // If either operand may be a NaN, propagate it to the result.
+  if (!MI.getFlag(MachineInstr::FmNoNans) &&
+      (!isKnownNeverNaN(Src0, MRI) || !isKnownNeverNaN(Src1, MRI))) {
+ auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1);
+
+ LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
+ APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy));
+ Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0);
+ if (Ty.isVector())
+ NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0);
+
+ Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0);
+ }
+
+ // fminimum/fmaximum requires -0.0 less than +0.0
+ if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) {
+ GISelValueTracking VT(MIRBuilder.getMF());
+ KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero);
+ KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero);
+
+ if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
+ const unsigned Flags = MI.getFlags();
+ Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0);
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero);
+
+ unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
+
+ auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass);
+ auto LHSSelect =
+ MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res, Flags);
+
+ auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass);
+ auto RHSSelect =
+ MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect, Flags);
+
+ Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res, Flags).getReg(0);
+ }
+ }
+
+ MIRBuilder.buildCopy(Dst, Res);
+ MI.eraseFromParent();
+ return Legalized;
+}
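A scalar model of the fallback sequence built above, as a sketch that ignores
the legal-operation and known-never-zero shortcuts; it shows the three layers
in order (compare-and-select, NaN propagation, signed-zero correction):

#include <cmath>
#include <limits>

double model_fminimum(double A, double B) {
  double Res = (A < B) ? A : B;       // FCMP_OLT + G_SELECT
  if (std::isnan(A) || std::isnan(B)) // unordered: inject a quiet NaN
    Res = std::numeric_limits<double>::quiet_NaN();
  if (Res == 0.0) {                   // result is a zero: prefer -0.0
    if (std::signbit(B))
      return B;
    if (std::signbit(A))
      return A;
  }
  return Res;
}

For maximum, the comparison flips to FCMP_OGT and the zero correction prefers
+0.0.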
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
// Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 488b078..1096e57 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4082,10 +4082,10 @@ void AssemblyWriter::printTypeIdentities() {
/// printFunction - Print all aspects of a function.
void AssemblyWriter::printFunction(const Function *F) {
- if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out);
-
if (F->isMaterializable())
Out << "; Materializable\n";
+ else if (AnnotationWriter)
+ AnnotationWriter->emitFunctionAnnot(F, Out);
const AttributeList &Attrs = F->getAttributes();
if (Attrs.hasFnAttrs()) {
diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt
index 1e1d0a6..70c4577 100644
--- a/llvm/lib/MC/CMakeLists.txt
+++ b/llvm/lib/MC/CMakeLists.txt
@@ -73,9 +73,10 @@ add_llvm_component_library(LLVMMC
${LLVM_MAIN_INCLUDE_DIR}/llvm/MC
LINK_COMPONENTS
+ BinaryFormat
+ DebugInfoDWARFLowLevel
Support
TargetParser
- BinaryFormat
DEPENDS
intrinsics_gen
diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp
index d6fa54c..e0a90df 100644
--- a/llvm/lib/MC/MCSFrame.cpp
+++ b/llvm/lib/MC/MCSFrame.cpp
@@ -8,6 +8,8 @@
#include "llvm/MC/MCSFrame.h"
#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h"
+#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
@@ -211,8 +213,152 @@ class SFrameEmitterImpl {
return true;
}
+  // Technically, the escape data could be anything, but it is commonly a
+  // DWARF CFI program. Even then, it could contain an arbitrarily complicated
+  // DWARF expression. Following GNU gas, look for certain common cases that
+  // could invalidate an FDE, emit a warning for those sequences, and don't
+  // generate an FDE in those cases. Allow any that are known safe. More
+  // thorough test cases could likely refine this code, but it handles the
+  // most important cases compatibly with gas.
+  // Returns true if the CFI escape sequence is safe for SFrame.
+ bool isCFIEscapeSafe(SFrameFDE &FDE, const SFrameFRE &FRE,
+ const MCCFIInstruction &CFI) {
+ const MCAsmInfo *AI = Streamer.getContext().getAsmInfo();
+    DWARFDataExtractorSimple Data(CFI.getValues(), AI->isLittleEndian(),
+                                  AI->getCodePointerSize());
+
+    // Normally, both alignment factors are extracted from the enclosing DWARF
+    // FDE or CIE. We don't have one here. Alignments are used as scaling
+    // factors for ops like CFA_def_cfa_offset_sf, but this particular
+    // function is only interested in registers.
+ dwarf::CFIProgram P(/*CodeAlignmentFactor=*/1,
+ /*DataAlignmentFactor=*/1,
+ Streamer.getContext().getTargetTriple().getArch());
+ uint64_t Offset = 0;
+    if (P.parse(Data, &Offset, CFI.getValues().size())) {
+ // Not a parsable dwarf expression. Assume the worst.
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+
+ // This loop deals with dwarf::CFIProgram::Instructions. Everywhere else
+ // this file deals with MCCFIInstructions.
+ for (const dwarf::CFIProgram::Instruction &I : P) {
+ switch (I.Opcode) {
+ case dwarf::DW_CFA_nop:
+ break;
+ case dwarf::DW_CFA_val_offset: {
+ // First argument is a register. Anything that touches CFA, FP, or RA is
+ // a problem, but allow others through. As an even more special case,
+ // allow SP + 0.
+ auto Reg = I.getOperandAsUnsigned(P, 0);
+ // The parser should have failed in this case.
+ assert(Reg && "DW_CFA_val_offset with no register.");
+ bool SPOk = true;
+ if (*Reg == SPReg) {
+ auto Opnd = I.getOperandAsSigned(P, 1);
+ if (!Opnd || *Opnd != 0)
+ SPOk = false;
+ }
+ if (!SPOk || *Reg == RAReg || *Reg == FPReg) {
+ StringRef RN = *Reg == SPReg
+ ? "SP reg "
+ : (*Reg == FPReg ? "FP reg " : "RA reg ");
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine(
+ "skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with ") +
+ RN + Twine(*Reg));
+ return false;
+ }
+ } break;
+ case dwarf::DW_CFA_expression: {
+ // First argument is a register. Anything that touches CFA, FP, or RA is
+ // a problem, but allow others through.
+ auto Reg = I.getOperandAsUnsigned(P, 0);
+ if (!Reg) {
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+ if (*Reg == SPReg || *Reg == RAReg || *Reg == FPReg) {
+ StringRef RN = *Reg == SPReg
+ ? "SP reg "
+ : (*Reg == FPReg ? "FP reg " : "RA reg ");
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine(
+ "skipping SFrame FDE; .cfi_escape DW_CFA_expression with ") +
+ RN + Twine(*Reg));
+ return false;
+ }
+ } break;
+ case dwarf::DW_CFA_GNU_args_size: {
+ auto Size = I.getOperandAsSigned(P, 0);
+ // Zero size doesn't affect the cfa.
+ if (Size && *Size == 0)
+ break;
+ if (FRE.Info.getBaseRegister() != BaseReg::FP) {
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine("skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size "
+ "with non frame-pointer CFA"));
+ return false;
+ }
+ } break;
+ // Cases that gas doesn't specially handle. TODO: Some of these could be
+ // analyzed and handled instead of just punting. But these are uncommon,
+ // or should be written as normal cfi directives. Some will need fixes to
+ // the scaling factor.
+ case dwarf::DW_CFA_advance_loc:
+ case dwarf::DW_CFA_offset:
+ case dwarf::DW_CFA_restore:
+ case dwarf::DW_CFA_set_loc:
+ case dwarf::DW_CFA_advance_loc1:
+ case dwarf::DW_CFA_advance_loc2:
+ case dwarf::DW_CFA_advance_loc4:
+ case dwarf::DW_CFA_offset_extended:
+ case dwarf::DW_CFA_restore_extended:
+ case dwarf::DW_CFA_undefined:
+ case dwarf::DW_CFA_same_value:
+ case dwarf::DW_CFA_register:
+ case dwarf::DW_CFA_remember_state:
+ case dwarf::DW_CFA_restore_state:
+ case dwarf::DW_CFA_def_cfa:
+ case dwarf::DW_CFA_def_cfa_register:
+ case dwarf::DW_CFA_def_cfa_offset:
+ case dwarf::DW_CFA_def_cfa_expression:
+ case dwarf::DW_CFA_offset_extended_sf:
+ case dwarf::DW_CFA_def_cfa_sf:
+ case dwarf::DW_CFA_def_cfa_offset_sf:
+ case dwarf::DW_CFA_val_offset_sf:
+ case dwarf::DW_CFA_val_expression:
+ case dwarf::DW_CFA_MIPS_advance_loc8:
+ case dwarf::DW_CFA_AARCH64_negate_ra_state_with_pc:
+ case dwarf::DW_CFA_AARCH64_negate_ra_state:
+ case dwarf::DW_CFA_LLVM_def_aspace_cfa:
+ case dwarf::DW_CFA_LLVM_def_aspace_cfa_sf:
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(), "skipping SFrame FDE; .cfi_escape "
+ "CFA expression with unknown side effects");
+ return false;
+ default:
+      // The DWARF program was only partially valid, and the user could have
+      // written anything.
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+ }
+ return true;
+ }
+
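For reference, a few escape payloads and how the scan above classifies them;
the byte values are the standard DWARF CFI opcodes, and the arrays are purely
illustrative:

#include <cstdint>

const uint8_t NopEscape[] = {0x00};       // DW_CFA_nop: allowed, FDE kept
const uint8_t ArgsSize0[] = {0x2e, 0x00}; // DW_CFA_GNU_args_size 0: allowed
const uint8_t CfaOff16[] = {0x0e, 0x10};  // DW_CFA_def_cfa_offset 16: in the
                                          // punt list, FDE skipped with a
                                          // warning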
// Add the effects of CFI to the current FDE, creating a new FRE when
- // necessary.
+ // necessary. Return true if the CFI is representable in the sframe format.
bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) {
switch (CFI.getOperation()) {
case MCCFIInstruction::OpDefCfaRegister:
@@ -265,10 +411,11 @@ class SFrameEmitterImpl {
FRE = FDE.SaveState.pop_back_val();
return true;
case MCCFIInstruction::OpEscape:
- // TODO: Implement. Will use FDE.
- return true;
+      // This is a string of bytes that typically encodes a DWARF CFI program,
+      // which may or may not affect the unwind info.
+ return isCFIEscapeSafe(FDE, FRE, CFI);
default:
- // Instructions that don't affect the CFA, RA, and SP can be safely
+ // Instructions that don't affect the CFA, RA, and FP can be safely
// ignored.
return true;
}
diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp
index 67483ba..9d45096 100644
--- a/llvm/lib/Support/Timer.cpp
+++ b/llvm/lib/Support/Timer.cpp
@@ -240,7 +240,8 @@ private:
getGroupEntry(StringRef GroupName, StringRef GroupDescription) {
std::pair<TimerGroup *, Name2TimerMap> &GroupEntry = Map[GroupName];
if (!GroupEntry.first)
- GroupEntry.first = new TimerGroup(GroupName, GroupDescription);
+ GroupEntry.first =
+ new TimerGroup(GroupName, GroupDescription, /*PrintOnExit=*/true);
return GroupEntry;
}
@@ -270,9 +271,10 @@ TimerGroup &NamedRegionTimer::getNamedTimerGroup(StringRef GroupName,
static TimerGroup *TimerGroupList = nullptr;
TimerGroup::TimerGroup(StringRef Name, StringRef Description,
- sys::SmartMutex<true> &lock)
+ sys::SmartMutex<true> &lock, bool PrintOnExit)
: Name(Name.begin(), Name.end()),
- Description(Description.begin(), Description.end()) {
+ Description(Description.begin(), Description.end()),
+ PrintOnExit(PrintOnExit) {
// Add the group to TimerGroupList.
sys::SmartScopedLock<true> L(lock);
if (TimerGroupList)
@@ -282,12 +284,12 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description,
TimerGroupList = this;
}
-TimerGroup::TimerGroup(StringRef Name, StringRef Description)
- : TimerGroup(Name, Description, timerLock()) {}
+TimerGroup::TimerGroup(StringRef Name, StringRef Description, bool PrintOnExit)
+ : TimerGroup(Name, Description, timerLock(), PrintOnExit) {}
TimerGroup::TimerGroup(StringRef Name, StringRef Description,
- const StringMap<TimeRecord> &Records)
- : TimerGroup(Name, Description) {
+ const StringMap<TimeRecord> &Records, bool PrintOnExit)
+ : TimerGroup(Name, Description, PrintOnExit) {
TimersToPrint.reserve(Records.size());
for (const auto &P : Records)
TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()),
@@ -301,7 +303,7 @@ TimerGroup::~TimerGroup() {
while (FirstTimer)
removeTimer(*FirstTimer);
- if (!TimersToPrint.empty()) {
+ if (!TimersToPrint.empty() && PrintOnExit) {
std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();
PrintQueuedTimers(*OutStream);
}
@@ -530,7 +532,7 @@ public:
sys::SmartMutex<true> TimerLock;
TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers",
- TimerLock};
+ TimerLock, /*PrintOnExit=*/true};
SignpostEmitter Signposts;
// Order of these members and initialization below is important. For example
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 018ef31..d16b116 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9002,12 +9002,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
}
static SMECallAttrs
-getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
+getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
const TargetLowering::CallLoweringInfo &CLI) {
if (CLI.CB)
- return SMECallAttrs(*CLI.CB, &TLI);
+ return SMECallAttrs(*CLI.CB, &RTLCI);
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
- return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
+ return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
}
@@ -9029,7 +9029,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// SME Streaming functions are not eligible for TCO as they may require
// the streaming mode or ZA to be restored after returning from the call.
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
+ SMECallAttrs CallAttrs =
+ getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
CallAttrs.caller().hasStreamingBody())
@@ -9454,7 +9455,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Determine whether we need any streaming mode changes.
- SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+ SMECallAttrs CallAttrs =
+ getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
std::optional<unsigned> ZAMarkerNode;
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
@@ -26723,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N,
}
if (N->getOpcode() == AArch64ISD::DUP) {
+ SDValue Op = N->getOperand(0);
+
+ // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
+ // For example:
+ // v4i32 = DUP (i32 (zextloadi8 addr))
+ // =>
+ // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+ // v4i32 = DUPLANE32 (v4i32), 0
+ if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ EVT MemVT = LD->getMemoryVT();
+ EVT ElemVT = VT.getVectorElementType();
+ if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+ (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
+ ElemVT != MemVT && LD->hasOneUse()) {
+ EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+ 128 / ElemVT.getSizeInBits());
+ SDValue ScalarToVec =
+ DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+ return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+ DCI.DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
// If the instruction is known to produce a scalar in SIMD registers, we can
// duplicate it across the vector lanes using DUPLANE instead of moving it
// to a GPR first. For example, this allows us to handle:
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
- SDValue Op = N->getOperand(0);
// FIXME: Ideally, we should be able to handle all instructions that
// produce a scalar value in FPRs.
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
@@ -29496,11 +29521,6 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x48);
- // Fuchsia is similar.
- // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
- if (Subtarget->isTargetFuchsia())
- return UseTlsOffset(IRB, -0x8);
-
return TargetLowering::getSafeStackPointerLocation(IRB);
}
@@ -29818,7 +29838,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
// Checks to allow the use of SME instructions
if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallAttrs = SMECallAttrs(*Base, this);
+ auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a352096..b9e299e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4022,22 +4022,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
-// load zero-extended i32, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-// load zero-extended i16, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-// load zero-extended i8, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-// load zero-extended i16, bitcast to f32
-def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-// load zero-extended i8, bitcast to f32
-def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
@@ -4389,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
+// Patterns for bitconvert or scalar_to_vector of load operations. These
+// enable direct SIMD-register loads of small integer types (i8/i16, and i32
+// in the extended variant) that are naturally zero-extended to i32/i64.
+multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy,
+ SDPatternOperator OuterOp,
+ PatFrags LoadOp8, PatFrags LoadOp16> {
+ // 8-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>;
+
+ // 16-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>;
+}
+
+// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit.
+multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy,
+ SDPatternOperator OuterOp,
+ PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> {
+ defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>;
+
+ // 32-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>;
+}
+
+// Instantiate bitconvert patterns for floating-point types.
+defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>;
+
+// Instantiate scalar_to_vector patterns for all vector types.
+defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>;
+defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>;
+
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
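
The ExtLoad8_16AllModes / ExtLoad8_16_32AllModes multiclasses above generalize the deleted one-off patterns: every zero- or any-extending i8/i16 (plus i32 in the _32 variant) load feeding a bitconvert or scalar_to_vector is now matched in all four addressing modes (scaled immediate, unscaled immediate, and both register-offset forms) and selected as a single FPR load wrapped in SUBREG_TO_REG. A minimal C++ sketch of source that plausibly exercises the f32 bitconvert instantiation (names are illustrative; whether the pattern actually fires depends on the surrounding DAG):

    #include <cstdint>
    #include <cstring>

    // Loads a byte, zero-extends it to i32, and reinterprets the bits as
    // f32 -- i.e. (f32 (bitconvert (i32 (zextloadi8 ...)))). With the
    // patterns above this can select as one LDRB into an FPR plus
    // SUBREG_TO_REG, instead of a GPR load followed by an fmov.
    float bits_from_byte(const uint8_t *p) {
      uint32_t widened = *p;                   // zextloadi8 -> i32
      float f;
      std::memcpy(&f, &widened, sizeof(f));    // bitconvert i32 -> f32
      return f;
    }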
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index bdde8e3..2387f17 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -2762,11 +2762,11 @@ def : InstRW<[V2Write_11c_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>;
def : InstRW<[V2Write_11c_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>;
// Non temporal store, scalar + imm
-def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
+def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;
// Non temporal store, scalar + scalar
-def : InstRW<[V2Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>;
-def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>;
+def : InstRW<[V2Write_2c_1L01_1S_1V01], (instrs STNT1H_ZRR)>;
+def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BWD]_ZRR$")>;
// Scatter non temporal store, vector + scalar 32-bit element size
def : InstRW<[V2Write_4c_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d50af11..fede586 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode(
static bool isSMEABIRoutineCall(const CallInst &CI,
const AArch64TargetLowering &TLI) {
const auto *F = CI.getCalledFunction();
- return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
+ return F &&
+ SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
}
/// Returns true if the function has explicit operations that can only be
@@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
// change only once and avoid inlining of G into F.
SMEAttrs FAttrs(*F);
- SMECallAttrs CallAttrs(Call, getTLI());
+ SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
if (F == Call.getCaller()) // (1)
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index d71f728..085c8588 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -75,8 +75,8 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
}
void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName,
- const AArch64TargetLowering &TLI) {
- RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName);
+ const RTLIB::RuntimeLibcallsInfo &RTLCI) {
+ RTLIB::LibcallImpl Impl = RTLCI.getSupportedLibcallImpl(FuncName);
if (Impl == RTLIB::Unsupported)
return;
unsigned KnownAttrs = SMEAttrs::Normal;
@@ -124,21 +124,22 @@ bool SMECallAttrs::requiresSMChange() const {
return true;
}
-SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI)
+SMECallAttrs::SMECallAttrs(const CallBase &CB,
+ const RTLIB::RuntimeLibcallsInfo *RTLCI)
: CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),
Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {
if (auto *CalledFunction = CB.getCalledFunction())
- CalledFn = SMEAttrs(*CalledFunction, TLI);
-
- // An `invoke` of an agnostic ZA function may not return normally (it may
- // resume in an exception block). In this case, it acts like a private ZA
- // callee and may require a ZA save to be set up before it is called.
- if (isa<InvokeInst>(CB))
- CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
+ CalledFn = SMEAttrs(*CalledFunction, RTLCI);
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
assert((IsIndirect ||
((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) &&
"SME attributes at callsite do not match declaration");
+
+ // An `invoke` of an agnostic ZA function may not return normally (it may
+ // resume in an exception block). In this case, it acts like a private ZA
+ // callee and may require a ZA save to be set up before it is called.
+ if (isa<InvokeInst>(CB))
+ CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
}
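
The reordering in this hunk matters: the assert checks that the callsite's per-call attributes are a subset of the callee declaration's, so clearing ZA_State_Agnostic on CalledFn for an `invoke` must happen after that check, or a perfectly consistent invoke of an agnostic-ZA function would trip the assertion. A toy bitmask model of the ordering (bit values and names illustrative, not the real SMEAttrs encoding):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t ZA_State_Agnostic = 1u << 4; // illustrative bit
      uint64_t CalledFn = ZA_State_Agnostic;  // declaration: agnostic-ZA
      uint64_t Callsite = ZA_State_Agnostic;  // clang copies attrs here

      // Patch order: validate first...
      assert((Callsite | CalledFn) == CalledFn); // holds
      // ...then weaken the callee for `invoke` semantics.
      CalledFn &= ~ZA_State_Agnostic;

      // Doing the steps in the opposite order would leave
      // (Callsite | CalledFn) != CalledFn and fire the assert.
      return 0;
    }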
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index d26e3cd..28c397e 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -12,8 +12,9 @@
#include "llvm/IR/Function.h"
namespace llvm {
-
-class AArch64TargetLowering;
+namespace RTLIB {
+struct RuntimeLibcallsInfo;
+}
class Function;
class CallBase;
@@ -52,14 +53,14 @@ public:
SMEAttrs() = default;
SMEAttrs(unsigned Mask) { set(Mask); }
- SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr)
+ SMEAttrs(const Function &F, const RTLIB::RuntimeLibcallsInfo *RTLCI = nullptr)
: SMEAttrs(F.getAttributes()) {
- if (TLI)
- addKnownFunctionAttrs(F.getName(), *TLI);
+ if (RTLCI)
+ addKnownFunctionAttrs(F.getName(), *RTLCI);
}
SMEAttrs(const AttributeList &L);
- SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) {
- addKnownFunctionAttrs(FuncName, TLI);
+ SMEAttrs(StringRef FuncName, const RTLIB::RuntimeLibcallsInfo &RTLCI) {
+ addKnownFunctionAttrs(FuncName, RTLCI);
};
void set(unsigned M, bool Enable = true) {
@@ -157,7 +158,7 @@ public:
private:
void addKnownFunctionAttrs(StringRef FuncName,
- const AArch64TargetLowering &TLI);
+ const RTLIB::RuntimeLibcallsInfo &RTLCI);
void validate() const;
};
@@ -175,7 +176,7 @@ public:
SMEAttrs Callsite = SMEAttrs::Normal)
: CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {}
- SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI);
+ SMECallAttrs(const CallBase &CB, const RTLIB::RuntimeLibcallsInfo *RTLCI);
SMEAttrs &caller() { return CallerFn; }
SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8b211f..7f00ead 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[
combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
]>;
+// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
+ [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
+ (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
+
+def combine_or_s64_s32 : GICombineRule<
+ (defs root:$dst),
+ (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
+ (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
+ (G_OR $or, $x_lo, $y),
+ (G_MERGE_VALUES $dst, $or, $x_hi))>;
+
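
The new combine rests on a simple identity: the zext of a 32-bit value has an all-zero high half, so OR-ing it into an i64 can only change the low 32 bits, and the 64-bit G_OR splits into a 32-bit G_OR on the low half with the high half passed through. A C++ sketch of the identity (names illustrative):

    #include <cassert>
    #include <cstdint>

    // (or i64:x, (zext i32:y)) == merge(or(lo_32(x), y), hi_32(x))
    uint64_t or_zext(uint64_t x, uint32_t y) { return x | (uint64_t)y; }

    uint64_t or_zext_split(uint64_t x, uint32_t y) {
      uint32_t lo = (uint32_t)x | y;       // 32-bit G_OR
      uint32_t hi = (uint32_t)(x >> 32);   // high half untouched
      return ((uint64_t)hi << 32) | lo;    // G_MERGE_VALUES
    }

    int main() {
      assert(or_zext(0x1234567890abcdefULL, 0xffu) ==
             or_zext_split(0x1234567890abcdefULL, 0xffu));
      return 0;
    }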
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
@@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 596a895..1a13b22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
+ auto &MinNumMaxNumIeee =
+ getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypesPK16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
+ } else {
+ MinNumMaxNumIeee.legalFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder(
- {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
- G_FMAXNUM_IEEE});
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
+ } else if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lowerFor({V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .lower();
} else {
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
}
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
@@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
- MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
- // With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
- //
- // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
- // enabled.
- if (!MFI->getMode().IEEE) {
- if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
- MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
- return true;
-
- return !IsIEEEOp;
- }
-
- if (IsIEEEOp)
+ // With ieee_mode disabled, the instructions have the correct behavior.
+ if (!MFI->getMode().IEEE)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
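
With this change G_FMINNUM_IEEE/G_FMAXNUM_IEEE get their own per-type action definitions and no longer reach the custom hook, so legalizeMinNumMaxNum only sees the remaining opcodes, lowering them via lowerFMinNumMaxNum when ieee_mode is enabled. For reference, the *_IEEE opcodes model IEEE-754 minNum/maxNum, which return the non-NaN operand when exactly one input is NaN; a C++ sketch of the minNum side (assuming already-quieted inputs, as the opcode does):

    #include <cmath>

    // Reference semantics for fminnum_ieee: if exactly one operand is
    // NaN, return the other; otherwise the ordinary minimum.
    float fminnum_ieee_ref(float a, float b) {
      if (std::isnan(a)) return b;
      if (std::isnan(b)) return a;
      return a < b ? a : b;
    }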
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index fb0928b8..ede8463 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -79,6 +79,12 @@ def ExtensionHVXV79: SubtargetFeature<"hvxv79", "HexagonHVXVersion",
ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
ExtensionHVXV73, ExtensionHVXV75]>;
+def ExtensionHVXV81: SubtargetFeature<"hvxv81", "HexagonHVXVersion",
+ "Hexagon::ArchEnum::V81", "Hexagon HVX instructions",
+ [ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67,
+ ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
+ ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79]>;
+
def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
"true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps",
@@ -151,6 +157,8 @@ def UseHVXV75 : Predicate<"HST->useHVXV75Ops()">,
AssemblerPredicate<(all_of ExtensionHVXV75)>;
def UseHVXV79 : Predicate<"HST->useHVXV79Ops()">,
AssemblerPredicate<(all_of ExtensionHVXV79)>;
+def UseHVXV81 : Predicate<"HST->useHVXV81Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV81)>;
def UseAudio : Predicate<"HST->useAudioOps()">,
AssemblerPredicate<(all_of ExtensionAudio)>;
def UseZReg : Predicate<"HST->useZRegOps()">,
@@ -488,6 +496,11 @@ def : Proc<"hexagonv79", HexagonModelV79,
ArchV68, ArchV69, ArchV71, ArchV73, ArchV75, ArchV79,
FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
+def : Proc<"hexagonv81", HexagonModelV81,
+ [ArchV65, ArchV66, ArchV67, ArchV68, ArchV69, ArchV71, ArchV73,
+ ArchV75, ArchV79, ArchV81,
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
// Need to update the correct features for tiny core.
// Disable NewValueJumps since the packetizer is unable to handle a packet with
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h
index 8984534..9bf4034 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -29,7 +29,8 @@ enum class ArchEnum {
V71,
V73,
V75,
- V79
+ V79,
+ V81
};
inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {
@@ -50,6 +51,7 @@ inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {
.Case("hexagonv73", Hexagon::ArchEnum::V73)
.Case("hexagonv75", Hexagon::ArchEnum::V75)
.Case("hexagonv79", Hexagon::ArchEnum::V79)
+ .Case("hexagonv81", Hexagon::ArchEnum::V81)
.Default(std::nullopt);
}
} // namespace Hexagon
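
The added `.Case` makes "hexagonv81" resolvable through the same StringSwitch as the older CPU names (e.g. via -mcpu=hexagonv81). A small check of the mapping, assuming the usual llvm::Hexagon namespace nesting and that this header is reachable on the include path:

    #include <cassert>
    #include "HexagonDepArch.h"  // assumed include path

    int main() {
      auto A = llvm::Hexagon::getCpu("hexagonv81");
      assert(A && *A == llvm::Hexagon::ArchEnum::V81);
      // Unknown CPU strings still fall through to std::nullopt.
      assert(!llvm::Hexagon::getCpu("hexagonv99"));
      return 0;
    }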
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td
index 8ec1d93..f623fd0 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -34,3 +34,5 @@ def ArchV75: SubtargetFeature<"v75", "HexagonArchVersion", "Hexagon::ArchEnum::V
def HasV75 : Predicate<"HST->hasV75Ops()">, AssemblerPredicate<(all_of ArchV75)>;
def ArchV79: SubtargetFeature<"v79", "HexagonArchVersion", "Hexagon::ArchEnum::V79", "Enable Hexagon V79 architecture">;
def HasV79 : Predicate<"HST->hasV79Ops()">, AssemblerPredicate<(all_of ArchV79)>;
+def ArchV81: SubtargetFeature<"v81", "HexagonArchVersion", "Hexagon::ArchEnum::V81", "Enable Hexagon V81 architecture">;
+def HasV81 : Predicate<"HST->hasV81Ops()">, AssemblerPredicate<(all_of ArchV81)>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index 93696e0..f4e36fa7 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -7222,3 +7222,595 @@ class DepHVXItinV79 {
[Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
+
+class DepHVXItinV81 {
+ list<InstrItinData> DepHVXItinV81_list = [
+ InstrItinData <tc_0390c1ca, /*SLOT01,LOAD,VA,VX_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_227864f7, /*SLOT0,STORE,VA,VX_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_37820f4c, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_4942646a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_531b383c, /*SLOT0123*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_649072c2, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7177e272, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_72e2b393, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_73efe966, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8772086c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9a1cab75, /*SLOT01,LOAD,VA,VX_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9aff7a2a, /*SLOT0,STORE,VA,VX_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a19b9305, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c4edf264, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cda936da, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_dcca380f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2fdd6e6, /*SLOT0123*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f175e046, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
+ ];
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
index 7a1ad3e..48b665c 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -13740,3 +13740,891 @@ class DepScalarItinV79 {
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
];
}
+
+class DepScalarItinV81 {
+ list<InstrItinData> DepScalarItinV81_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a43be35, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_139ef484, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_33e7e673, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_46c18ecf, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_512b1653, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54f0cee2, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a222e89, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6aa823ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb52018, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_759e57be, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_78f87ed3, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c28bd7e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d6a2568, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f58404a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8f36a2fd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95f43c5e, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9b20a062, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a724463d, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af6af259, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b2196a3f, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b3d46584, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_b9bec29e, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb78483e, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d234b61a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d71ea8fa, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e60def48, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed3f8d2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f97707c1, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+} \ No newline at end of file
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index ae96753..f8f1c2a 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -39178,6 +39178,19 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vsub_hf_mix : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.qf16)",
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vsub_qf16 : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -39269,6 +39282,19 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vsub_sf_mix : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.qf32)",
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vsub_sf_sf : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -41116,6 +41142,17 @@ let hasNewValue = 1;
let opNewValue = 0;
let isSolo = 1;
}
+def Y2_tlbpp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = tlbp($Rss32)",
+tc_6aa823ab, TypeCR>, Enc_90cd8b, Requires<[HasV81]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+}
def Y2_tlbr : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 17cb96c..23f4b3a 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -3827,3 +3827,14 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2),
(V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX128B]>;
+
+// V81 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_sf_mix_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
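
A hedged usage sketch (not part of the patch): assuming the usual Hexagon HVX builtin naming convention, the new V81 mix-subtract patterns above become reachable from source roughly like this when building for v81 with 128-byte HVX. The builtin name is an assumption and should be checked against BuiltinsHexagon.td.

    #include <hexagon_types.h>
    // Illustrative only; follows the __builtin_HEXAGON_<inst>_128B convention.
    HVX_Vector sub_sf_mix(HVX_Vector Vu_sf, HVX_Vector Vv_qf32) {
      return __builtin_HEXAGON_V6_vsub_sf_mix_128B(Vu_sf, Vv_qf32);
    }
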
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index e285e04..7ee280d 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
IntNo == Intrinsic::hexagon_V6_vgathermh ||
IntNo == Intrinsic::hexagon_V6_vgathermh_128B ||
IntNo == Intrinsic::hexagon_V6_vgathermhw ||
- IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) {
+ IntNo == Intrinsic::hexagon_V6_vgathermhw_128B ||
+ IntNo == Intrinsic::hexagon_V6_vgather_vscattermh ||
+ IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) {
SelectV65Gather(N);
return;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index c7a4f68..3cc146b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
case Intrinsic::hexagon_V6_vgathermhw_128B:
Opcode = Hexagon::V6_vgathermhw_pseudo;
break;
+ case Intrinsic::hexagon_V6_vgather_vscattermh:
+ case Intrinsic::hexagon_V6_vgather_vscattermh_128B:
+ Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo;
+ break;
}
SDVTList VTs = CurDAG->getVTList(MVT::Other);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 9f7f434..526b4de 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::hexagon_V6_vgathermhq:
case Intrinsic::hexagon_V6_vgathermhq_128B:
case Intrinsic::hexagon_V6_vgathermhwq:
- case Intrinsic::hexagon_V6_vgathermhwq_128B: {
+ case Intrinsic::hexagon_V6_vgathermhwq_128B:
+ case Intrinsic::hexagon_V6_vgather_vscattermh:
+ case Intrinsic::hexagon_V6_vgather_vscattermh_128B: {
const Module &M = *I.getParent()->getParent()->getParent();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Type *VecTy = I.getArgOperand(1)->getType();
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 939841a..47726d6 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {
MachineBasicBlock::iterator First;
switch (Opc) {
- case Hexagon::V6_vgathermh_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermw_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhw_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermwq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhwq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
+ case Hexagon::V6_vgather_vscatter_mh_pseudo:
+    // This is mainly a placeholder; it will be extended.
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+ case Hexagon::V6_vgathermh_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermw_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhw_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermwq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhwq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
}
return MI.getIterator();
@@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vL32b_nt_tmp_npred_ai:
case Hexagon::V6_vS32Ub_npred_ai:
case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgather_vscatter_mh_pseudo:
case Hexagon::V6_vgathermw_pseudo:
case Hexagon::V6_vgathermhw_pseudo:
case Hexagon::V6_vgathermhq_pseudo:
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
index f927f9b..42393d0 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
@@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo : vgathermh<HvxVR>;
defm V6_vgathermw_pseudo : vgathermw<HvxVR>;
defm V6_vgathermhw_pseudo : vgathermhw<HvxWR>;
+
+multiclass vgather_scatter_mh<RegisterClass RC> {
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
+ def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
+ ".error \"should not emit\" ",
+ []>;
+}
+
+defm V6_vgather_vscatter_mh_pseudo : vgather_scatter_mh<HvxVR>;
+
multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {
let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td
index b8a9cf3..9bcd4bf 100644
--- a/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -75,3 +75,4 @@ include "HexagonScheduleV71T.td"
include "HexagonScheduleV73.td"
include "HexagonScheduleV75.td"
include "HexagonScheduleV79.td"
+include "HexagonScheduleV81.td" \ No newline at end of file
diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV81.td b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td
new file mode 100644
index 0000000..dd5f5a0
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td
@@ -0,0 +1,31 @@
+//=-HexagonScheduleV81.td - HexagonV81 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HexagonV81ItinList : DepScalarItinV81, ScalarItin,
+ DepHVXItinV81, HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV81_list, ScalarItin_list,
+ DepHVXItinV81_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV81 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM, CVI_ZW],
+ [Hex_FWD, HVX_FWD],
+ HexagonV81ItinList.ItinList>;
+
+def HexagonModelV81 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV81;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
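
A minimal sketch (assumes LLVM's MC headers) of how the [4, 3, 2]-style operand-cycle lists in the InstrItinData entries above are consumed; operand 0 is normally the def.

    #include "llvm/MC/MCInstrItineraries.h"
    // Cycle at which the def of the given scheduling class becomes available,
    // e.g. 4 for a tc_2-class entry with operand cycles [4, 2, 2].
    static int defCycle(const llvm::InstrItineraryData &IID, unsigned Class) {
      return IID.getOperandCycle(Class, /*OperandIdx=*/0);
    }
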
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 7430567..995f66d 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -224,6 +224,15 @@ public:
bool useHVXV79Ops() const {
return HexagonHVXVersion >= Hexagon::ArchEnum::V79;
}
+ bool hasV81Ops() const {
+ return getHexagonArchVersion() >= Hexagon::ArchEnum::V81;
+ }
+ bool hasV81OpsOnly() const {
+ return getHexagonArchVersion() == Hexagon::ArchEnum::V81;
+ }
+ bool useHVXV81Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V81;
+ }
bool useAudioOps() const { return UseAudioOps; }
bool useCompound() const { return UseCompound; }
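
A hedged call-site sketch (hypothetical, not in this patch) of how the new predicates gate selection, mirroring the Requires<[UseHVXV81, UseHVXQFloat]> constraints on the V81 instructions; useHVXQFloatOps() is assumed to be the subtarget's existing QFloat predicate.

    static bool canSelectV81MixSub(const llvm::HexagonSubtarget &ST) {
      return ST.useHVXV81Ops() && ST.useHVXQFloatOps();
    }
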
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 171e294..e925e04 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -31,6 +31,10 @@ using namespace llvm;
static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
+cl::opt<bool> HexagonAllowScatterGatherHVX(
+ "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden,
+ cl::desc("Allow auto-generation of HVX scatter-gather"));
+
static cl::opt<bool> EnableV68FloatAutoHVX(
"force-hvx-float", cl::Hidden,
cl::desc("Enable auto-vectorization of floatint point types on v68."));
@@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}
+bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
+  // For now, assume we cannot deal with all HVX data types.
+ if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+ !HexagonAllowScatterGatherHVX)
+ return false;
+  // This must be kept in sync with the HexagonVectorCombine pass.
+ switch (Ty->getScalarSizeInBits()) {
+ case 8:
+ return (getTypeNumElements(Ty) == 128);
+ case 16:
+ if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32)
+ return (Alignment >= 2);
+ break;
+ case 32:
+ if (getTypeNumElements(Ty) == 32)
+ return (Alignment >= 4);
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const {
+ if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+ !HexagonAllowScatterGatherHVX)
+ return false;
+  // This must be kept in sync with the HexagonVectorCombine pass.
+ switch (Ty->getScalarSizeInBits()) {
+ case 8:
+ return (getTypeNumElements(Ty) == 128);
+ case 16:
+ if (getTypeNumElements(Ty) == 64)
+ return (Alignment >= 2);
+ break;
+ case 32:
+ if (getTypeNumElements(Ty) == 32)
+ return (Alignment >= 4);
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
+ Align Alignment) const {
+ return !isLegalMaskedGather(VTy, Alignment);
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy,
+ Align Alignment) const {
+ return !isLegalMaskedScatter(VTy, Alignment);
+}
+
/// --- Vector TTI end ---
unsigned HexagonTTIImpl::getPrefetchDistance() const {
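
A minimal sketch (illustrative types, assumes LLVM IR headers) of an IR shape the new hooks keep as a real gather under -hexagon-allow-scatter-gather-hvx: a <64 x i16> gather at alignment >= 2 on a 128-byte HVX target. Anything the hooks reject is scalarized through forceScalarizeMaskedGather/Scatter; note the deliberate asymmetry that <32 x i16> is accepted for gather but not for scatter.

    #include "llvm/IR/IRBuilder.h"
    static llvm::Value *emitHvxGather(llvm::IRBuilder<> &B, llvm::Value *Ptrs,
                                      llvm::Value *Mask) {
      // One full 128-byte HVX register: 64 x i16.
      auto *VTy = llvm::FixedVectorType::get(B.getInt16Ty(), 64);
      return B.CreateMaskedGather(VTy, Ptrs, llvm::Align(2), Mask);
    }
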
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index dbf16c9..cec2bf9 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -169,6 +169,12 @@ public:
unsigned AddressSpace) const override;
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
unsigned AddressSpace) const override;
+ bool isLegalMaskedGather(Type *Ty, Align Alignment) const override;
+ bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override;
+ bool forceScalarizeMaskedGather(VectorType *VTy,
+ Align Alignment) const override;
+ bool forceScalarizeMaskedScatter(VectorType *VTy,
+ Align Alignment) const override;
/// @}
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 9ab5202..5c50ec2 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -57,6 +57,11 @@
#define DEBUG_TYPE "hexagon-vc"
+// This constant represents the default HVX VTCM page size. It is boot-time
+// configurable, so we probably want an API to read it, but for now assume
+// 128KB.
+#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
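+// (131072 == 128 * 1024; the value is passed as the "size of region"
+// operand of the vgather/vscatter intrinsics emitted below.)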
+
using namespace llvm;
namespace {
@@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
class HvxIdioms {
public:
+ enum DstQualifier {
+ Undefined = 0,
+ Arithmetic,
+ LdSt,
+ LLVM_Gather,
+ LLVM_Scatter,
+ HEX_Gather_Scatter,
+ HEX_Gather,
+ HEX_Scatter,
+ Call
+ };
+
HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
auto *Int32Ty = HVC.getIntTy(32);
HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
@@ -473,6 +490,11 @@ private:
auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
Signedness SgnX, ArrayRef<Value *> WordY,
Signedness SgnY) const -> SmallVector<Value *>;
+ // Vector manipulations for Ripple
+ bool matchScatter(Instruction &In) const;
+ bool matchGather(Instruction &In) const;
+ Value *processVScatter(Instruction &In) const;
+ Value *processVGather(Instruction &In) const;
VectorType *HvxI32Ty;
VectorType *HvxP32Ty;
@@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
}
auto AlignVectors::run() -> bool {
- LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName()
+ LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
<< '\n');
if (!createAddressGroups())
return false;
@@ -1797,6 +1819,846 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
return Ext;
}
+inline bool HvxIdioms::matchScatter(Instruction &In) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+ if (!II)
+ return false;
+ return (II->getIntrinsicID() == Intrinsic::masked_scatter);
+}
+
+inline bool HvxIdioms::matchGather(Instruction &In) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+ if (!II)
+ return false;
+ return (II->getIntrinsicID() == Intrinsic::masked_gather);
+}
+
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
+
+// Binary instructions we want to handle as users of gather/scatter.
+inline bool isArithmetic(unsigned Opc) {
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::UDiv:
+ return true;
+ }
+ return false;
+}
+
+// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
+inline Value *getPointer(Value *Ptr) {
+ assert(Ptr && "Unable to extract pointer");
+ if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
+ return Ptr;
+ if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
+ return getLoadStorePointerOperand(Ptr);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(1);
+ }
+ return nullptr;
+}
+
+static Instruction *selectDestination(Instruction *In,
+ HvxIdioms::DstQualifier &Qual) {
+ Instruction *Destination = nullptr;
+ if (!In)
+ return Destination;
+ if (isa<StoreInst>(In)) {
+ Destination = In;
+ Qual = HvxIdioms::LdSt;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_gather) {
+ Destination = In;
+ Qual = HvxIdioms::LLVM_Gather;
+ } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
+ Destination = In;
+ Qual = HvxIdioms::LLVM_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
+ Destination = In;
+ Qual = HvxIdioms::LdSt;
+ } else if (II->getIntrinsicID() ==
+ Intrinsic::hexagon_V6_vgather_vscattermh) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Gather_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Gather;
+ }
+ } else if (isa<ZExtInst>(In)) {
+ return locateDestination(In, Qual);
+ } else if (isa<CastInst>(In)) {
+ return locateDestination(In, Qual);
+ } else if (isa<CallInst>(In)) {
+ Destination = In;
+ Qual = HvxIdioms::Call;
+ } else if (isa<GetElementPtrInst>(In)) {
+ return locateDestination(In, Qual);
+ } else if (isArithmetic(In->getOpcode())) {
+ Destination = In;
+ Qual = HvxIdioms::Arithmetic;
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
+ }
+ return Destination;
+}
+
+// This method attempts to find the destination (user) of a given intrinsic.
+// Given that these are produced only by Ripple, the number of options is
+// limited. The simplest case is an explicit store, which is in fact redundant
+// (the HVX gather creates its own store during packetization); nevertheless,
+// we need to figure out the address we are storing to. The other cases are
+// more complicated, but still few.
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
+ Instruction *Destination = nullptr;
+ if (!In)
+ return Destination;
+ // Get all possible destinations
+ SmallVector<Instruction *> Users;
+ // Iterate over the uses of the instruction
+ for (auto &U : In->uses()) {
+ if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
+ Destination = selectDestination(UI, Qual);
+ if (Destination)
+ Users.push_back(Destination);
+ }
+ }
+ // Now see which of the users (if any) is a memory destination.
+ for (auto *I : Users)
+ if (getPointer(I))
+ return I;
+ return Destination;
+}
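+
+// Illustrative sketch (hypothetical IR, not from this patch) of the simplest
+// chain locateDestination resolves: a Ripple-produced gather whose only user
+// is a store, classified as LdSt:
+//   %g = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(
+//            <64 x ptr> %p, i32 2, <64 x i1> %m, <64 x i16> poison)
+//   store <64 x i16> %g, ptr %dst   ; locateDestination returns this store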
+
+// The two intrinsics we handle here have GEP in a different position.
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
+ assert(In && "Bad instruction");
+ IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
+ assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
+ IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
+ "Not a gather Intrinsic");
+ GetElementPtrInst *GEPIndex = nullptr;
+ if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
+ GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
+ else
+ GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
+ return GEPIndex;
+}
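+
+// For reference, the (abbreviated) intrinsic signatures that fix those
+// operand positions:
+//   declare <N x T> @llvm.masked.gather (<N x ptr> %p, i32 %align,
+//                                        <N x i1> %mask, <N x T> %thru)
+//   declare void @llvm.masked.scatter(<N x T> %val, <N x ptr> %p,
+//                                     i32 %align, <N x i1> %mask)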
+
+// Given the intrinsic, find its GEP argument and extract the base address it
+// uses. The method relies on the way Ripple typically forms the GEP for
+// scatter/gather.
+static Value *locateAddressFromIntrinsic(Instruction *In) {
+ GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+ if (!GEPIndex) {
+ LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
+ return nullptr;
+ }
+ Value *BaseAddress = GEPIndex->getPointerOperand();
+ auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
+ if (IndexLoad)
+ return IndexLoad;
+
+ auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
+ if (IndexZEx) {
+ IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
+ if (IndexLoad)
+ return IndexLoad;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+ return locateAddressFromIntrinsic(II);
+ }
+ auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
+ if (BaseShuffle) {
+ IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
+ if (IndexLoad)
+ return IndexLoad;
+ auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
+ if (IE) {
+ auto *Src = IE->getOperand(1);
+ IndexLoad = dyn_cast<LoadInst>(Src);
+ if (IndexLoad)
+ return IndexLoad;
+ auto *Alloca = dyn_cast<AllocaInst>(Src);
+ if (Alloca)
+ return Alloca;
+ if (isa<Argument>(Src)) {
+ return Src;
+ }
+ if (isa<GlobalValue>(Src)) {
+ return Src;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
+ return nullptr;
+}
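+
+// Illustrative Ripple base-splat shape the walk above recognizes
+// (hypothetical IR): the GEP's pointer operand is a splat of one base:
+//   %ie  = insertelement <64 x ptr> poison, ptr %base, i64 0
+//   %bc  = shufflevector <64 x ptr> %ie, <64 x ptr> poison,
+//                        <64 x i32> zeroinitializer
+//   %gep = getelementptr i16, <64 x ptr> %bc, <64 x i32> %idx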
+
+static Type *getIndexType(Value *In) {
+ if (!In)
+ return nullptr;
+
+ if (isa<LoadInst>(In) || isa<StoreInst>(In))
+ return getLoadStoreType(In);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getType();
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(0)->getType();
+ }
+ return In->getType();
+}
+
+static Value *locateIndexesFromGEP(Value *In) {
+ if (!In)
+ return nullptr;
+ if (isa<LoadInst>(In))
+ return In;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return In;
+ if (II->getIntrinsicID() == Intrinsic::masked_gather)
+ return In;
+ }
+ if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
+ return locateIndexesFromGEP(IndexZEx->getOperand(0));
+ if (auto *IndexSEx = dyn_cast<SExtInst>(In))
+ return locateIndexesFromGEP(IndexSEx->getOperand(0));
+ if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
+ return locateIndexesFromGEP(BaseShuffle->getOperand(0));
+ if (auto *IE = dyn_cast<InsertElementInst>(In))
+ return locateIndexesFromGEP(IE->getOperand(1));
+ if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
+ return cstDataVector;
+ if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
+ return GEPIndex->getOperand(0);
+ return nullptr;
+}
+
+// Given the intrinsic, find its GEP argument and extract the offsets from
+// the base address it uses.
+static Value *locateIndexesFromIntrinsic(Instruction *In) {
+ GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+ if (!GEPIndex) {
+ LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
+ return nullptr;
+ }
+ Value *Indexes = GEPIndex->getOperand(1);
+ if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
+ return IndexLoad;
+
+ LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
+ return nullptr;
+}
+
+// Because of the awkward definition of many Hexagon intrinsics, we often have
+// to reinterpret a native HVX <64 x i16> as <32 x i32>. In practice this is a
+// NOP for all use cases, so it exists only to make the IR builder happy.
+inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder,
+ LLVMContext &Ctx, Value *I) {
+  assert(I && "Unable to reinterpret cast");
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ std::vector<unsigned> shuffleMask;
+ for (unsigned i = 0; i < 64; ++i)
+ shuffleMask.push_back(i);
+ Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+ Value *CastShuffle =
+ Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+ return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
+}
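+
+// Net effect (illustrative IR): an identity shuffle followed by a bitcast,
+//   %s = shufflevector <64 x i16> %v, <64 x i16> %v, <64 x i32> <identity>
+//   %c = bitcast <64 x i16> %s to <32 x i32>
+// i.e. a pure reinterpretation with no data movement.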
+
+// Recast <128 x i8> as <32 x i32>
+inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder,
+ LLVMContext &Ctx, Value *I) {
+  assert(I && "Unable to reinterpret cast");
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ std::vector<unsigned> shuffleMask;
+ for (unsigned i = 0; i < 128; ++i)
+ shuffleMask.push_back(i);
+ Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+ Value *CastShuffle =
+ Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+ return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
+}
+
+// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
+inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder, LLVMContext &Ctx,
+ unsigned int pattern) {
+ std::vector<unsigned int> byteMask;
+ for (unsigned i = 0; i < 32; ++i)
+ byteMask.push_back(pattern);
+
+ return Builder.CreateIntrinsic(
+ HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
+ {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
+ nullptr);
+}
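+
+// Worked example: pattern 0x00ff00ff replicated into all 32 lanes enables
+// bytes 0 and 2 of every 32-bit lane (the low byte of each halfword); after
+// V6_vandvrt with ~0 as the scalar, this becomes the <128 x i1> per-byte
+// predicate the i8 gather/scatter paths below use to mask individual bytes.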
+
+Value *HvxIdioms::processVScatter(Instruction &In) const {
+ auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
+  assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather");
+ unsigned InpSize = HVC.getSizeOf(InpTy);
+ auto *F = In.getFunction();
+ LLVMContext &Ctx = F->getContext();
+ auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
+ assert(ElemTy && "llvm.scatter needs integer type argument");
+ unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
+ LLVM_DEBUG({
+ unsigned Elements = HVC.length(InpTy);
+ dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
+ dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
+ << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
+ << ElemWidth << ")\n";
+ });
+
+ IRBuilder Builder(In.getParent(), In.getIterator(),
+ InstSimplifyFolder(HVC.DL));
+
+ auto *ValueToScatter = In.getOperand(0);
+ LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
+
+ if (HVC.HST.getVectorLength() != InpSize) {
+ LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
+ << ") for vscatter\n");
+ return nullptr;
+ }
+
+ // Base address of indexes.
+ auto *IndexLoad = locateAddressFromIntrinsic(&In);
+ if (!IndexLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
+
+ // Address of destination. Must be in VTCM.
+ auto *Ptr = getPointer(IndexLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+ // Indexes/offsets
+ auto *Indexes = locateIndexesFromIntrinsic(&In);
+ if (!Indexes)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
+ Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
+ "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
+ // Adjust Indexes
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ Value *CastIndex = nullptr;
+ if (cstDataVector) {
+    // Our indexes are represented as a constant. We need them in a reg.
+ AllocaInst *IndexesAlloca =
+ Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
+ IndexesAlloca, "reload_index");
+ } else {
+ if (ElemWidth == 2)
+ CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ else
+ CastIndex = Indexes;
+ }
+ LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
+
+ if (ElemWidth == 1) {
+    // v128i8: there is no native instruction for this.
+    // Do it as two Hi/Lo scatters with masking.
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+    // Extend indexes. We assume the indexes are in 128i8 format and need to
+    // expand them to Hi/Lo 64i16.
+ Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
+ auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+ auto *UnpackedIndexes = Builder.CreateIntrinsic(
+ HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
+
+ auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+ auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+ [[maybe_unused]] Value *IndexHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+ [[maybe_unused]] Value *IndexLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+ LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
+ // Now unpack values to scatter
+ Value *CastSrc =
+ getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
+ LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
+ auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
+ HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
+ << ")\n");
+
+ [[maybe_unused]] Value *UVSHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
+ [[maybe_unused]] Value *UVSLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
+ LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
+
+ // Create the mask for individual bytes
+ auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+ LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
+ [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+ {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ IndexHi, UVSHi},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+ {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ IndexLo, UVSLo},
+ nullptr);
+ } else if (ElemWidth == 2) {
+ Value *CastSrc =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
+ LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
+ {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+ CastSrc},
+ nullptr);
+ } else if (ElemWidth == 4) {
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
+ {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+ ValueToScatter},
+ nullptr);
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
+ return nullptr;
+ }
+}
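+
+// End-to-end sketch of the ElemWidth == 2 path (hypothetical IR; VTCM
+// addresses assumed):
+//   before: call void @llvm.masked.scatter.v64i16.v64p0(
+//               <64 x i16> %val, <64 x ptr> %gep, i32 2, <64 x i1> %m)
+//   after:  call void @llvm.hexagon.V6.vscattermh.128B(
+//               i32 %dst_base, i32 131072, <32 x i32> %idx, <32 x i32> %val)
+// where %idx and %val are the <64 x i16> values reinterpreted as <32 x i32>.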
+
+Value *HvxIdioms::processVGather(Instruction &In) const {
+ [[maybe_unused]] auto *InpTy =
+ dyn_cast<VectorType>(In.getOperand(0)->getType());
+  assert(InpTy && "Cannot handle non-vector type for llvm.gather");
+ [[maybe_unused]] auto *ElemTy =
+ dyn_cast<PointerType>(InpTy->getElementType());
+ assert(ElemTy && "llvm.gather needs vector of ptr argument");
+ auto *F = In.getFunction();
+ LLVMContext &Ctx = F->getContext();
+ LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
+ << *In.getParent() << "\n");
+ LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
+ << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
+ << ") type(" << *ElemTy << ") Access alignment("
+ << *In.getOperand(1) << ") AddressSpace("
+ << ElemTy->getAddressSpace() << ")\n");
+
+ // TODO: Handle masking of elements.
+  assert(isa<VectorType>(In.getOperand(2)->getType()) &&
+         "llvm.gather needs vector for mask");
+ IRBuilder Builder(In.getParent(), In.getIterator(),
+ InstSimplifyFolder(HVC.DL));
+
+  // See who is using the result. The difference between the LLVM and HVX
+  // vgather intrinsics makes it impossible to handle all cases without temp
+  // storage, and alloca in VTCM is not yet supported, so for now we just
+  // bail out in those cases.
+ HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
+ Instruction *Dst = locateDestination(&In, Qual);
+ if (!Dst) {
+ LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
+ << ")\n");
+
+ // Address of destination. Must be in VTCM.
+ auto *Ptr = getPointer(Dst);
+ if (!Ptr) {
+ LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
+ return nullptr;
+ }
+
+ // Result type. Assume it is a vector type.
+  auto *DstType = dyn_cast<VectorType>(getIndexType(Dst));
+  assert(DstType && "Cannot handle non-vector dst type for llvm.gather");
+
+ // Base address for sources to be loaded
+ auto *IndexLoad = locateAddressFromIntrinsic(&In);
+ if (!IndexLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
+
+ // Gather indexes/offsets
+ auto *Indexes = locateIndexesFromIntrinsic(&In);
+ if (!Indexes)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
+
+ Instruction *Gather = nullptr;
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
+ // We fully assume the address space is in VTCM. We also assume that all
+ // pointers in Operand(0) have the same base(!).
+ // This is the most basic case of all the above.
+ unsigned OutputSize = HVC.getSizeOf(DstType);
+ auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
+ unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
+ LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
+ << " Address space ("
+ << Ptr->getType()->getPointerAddressSpace() << ")\n"
+ << " Result type : " << *DstType
+ << "\n Size in bytes : " << OutputSize
+ << " element type(" << *DstElemTy
+ << ")\n ElemWidth : " << ElemWidth << " bytes\n");
+
+    auto *IndexType = dyn_cast<VectorType>(getIndexType(Indexes));
+    assert(IndexType && "Cannot handle non-vector index type for llvm.gather");
+ unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
+ LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
+
+ // Intrinsic takes i32 instead of pointer so cast.
+ Value *CastedPtr = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
+ // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
+ // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
+ // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
+ // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
+ // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
+ // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
+ if (HVC.HST.getVectorLength() == OutputSize) {
+ if (ElemWidth == 1) {
+        // v128i8: there is no native instruction for this.
+        // Do it as two Hi/Lo gathers with masking.
+        // Unpack indexes. We assume the indexes are in 128i8 format and need
+        // to expand them to Hi/Lo 64i16.
+ Value *CastIndexes =
+ Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
+ auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+ auto *UnpackedIndexes =
+ Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
+ V6_vunpack, CastIndexes, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
+ << ")\n");
+
+ auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+ auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+ [[maybe_unused]] Value *IndexHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+ [[maybe_unused]] Value *IndexLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+ LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
+ // Create the mask for individual bytes
+ auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+ LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
+        // We use our destination allocation as temp storage.
+ // This is unlikely to work properly for masked gather.
+ auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
+ [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, QByteMask, CastedPtr,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
+ // Rematerialize the result
+ [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
+ LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
+        // Same for the low part. Here we use Gather to return a non-null
+        // result from this function and continue to iterate. We also delete
+        // the Dst store below.
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, QByteMask, CastedPtr,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
+ Value *LoadedResultLo = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
+ LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
+ // Now we have properly sized bytes in every other position
+ // B b A a c a A b B c f F g G h H is presented as
+ // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
+ // Use vpack to gather them
+ auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
+ [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
+ NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
+ LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
+ [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
+ LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
+ } else if (ElemWidth == 2) {
+ // v32i16
+ if (IndexWidth == 2) {
+          // Reinterpret 64i16 as 32i32; only needed for the syntactic IR
+          // match.
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
+          // Shift each i16 index left by 1 to convert halfword indexes into
+          // the byte offsets the gather expects.
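+          // (e.g. element index 5 becomes byte offset 10)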
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+ LLVM_DEBUG(dbgs()
+ << " Shifted half index: " << *AdjustedIndex << ")\n");
+
+ auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
+          // The 3rd argument is the size of the region to gather from. We
+          // probably want to set it to the max VTCM size.
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+          LLVM_DEBUG({
+            for (auto &U : Dst->uses()) {
+              if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+                dbgs() << "  dst used by: " << *UI << "\n";
+            }
+            for (auto &U : In.uses()) {
+              if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+                dbgs() << "  In used by : " << *UI << "\n";
+            }
+          });
+ // Create temp load from result in case the result is used by any
+ // other instruction.
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ } else {
+          LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
+ return nullptr;
+ }
+ } else if (ElemWidth == 4) {
+ if (IndexWidth == 4) {
+ // v32i32
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
+ LLVM_DEBUG(dbgs()
+ << " Shifted word index: " << *AdjustedIndex << ")\n");
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
+ {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+ } else {
+ LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
+ return nullptr;
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
+ return nullptr;
+ }
+ } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
+      // This is half of the register width; duplicate low into high.
+      LLVM_DEBUG(dbgs() << " Unhandled: half of the register size\n");
+ return nullptr;
+ } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
+      LLVM_DEBUG(dbgs() << " Unhandled: twice the register size\n");
+ return nullptr;
+ }
+ // Erase the original intrinsic and store that consumes it.
+ // HVX will create a pseudo for gather that is expanded to gather + store
+ // during packetization.
+ Dst->eraseFromParent();
+ } else if (Qual == HvxIdioms::LLVM_Scatter) {
+ // Gather feeds directly into scatter.
+ LLVM_DEBUG({
+      auto *DstInpTy = dyn_cast<VectorType>(Dst->getOperand(1)->getType());
+      assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
+      unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
+      unsigned DstElements = HVC.length(DstInpTy);
+      auto *DstElemTy = dyn_cast<PointerType>(DstInpTy->getElementType());
+      assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
+ dbgs() << " Gather feeds into scatter\n Values to scatter : "
+ << *Dst->getOperand(0) << "\n";
+ dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
+ << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
+ << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
+ });
+ // Address of source
+ auto *Src = getPointer(IndexLoad);
+ if (!Src)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
+
+ if (!isa<PointerType>(Src->getType())) {
+ LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
+ return nullptr;
+ }
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
+
+ auto *DstLoad = locateAddressFromIntrinsic(Dst);
+ if (!DstLoad) {
+ LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
+
+ Value *Ptr = getPointer(DstLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
+ LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
+    // Shift each i16 index left by 1 to convert halfword indexes into the
+    // byte offsets the gather expects.
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+ LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
+
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+ } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
+    // Gather feeds into a previously inserted pseudo intrinsic. The two
+    // cannot be in the same packet, so we need to generate another pseudo
+    // that is expanded to .tmp + store:
+    //   V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
+    //                        ModRegs:$Mu, HvxVR:$Vv)
+ if (isa<AllocaInst>(IndexLoad)) {
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ if (cstDataVector) {
+        // Our indexes are represented as a constant. We need them in a reg.
+        // This most likely will not work properly, since alloca gives us a
+        // DDR stack location. This will be fixed once we teach the compiler
+        // about VTCM.
+ AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ Value *LoadedIndex = Builder.CreateLoad(
+ IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ }
+ } else {
+ // Address of source
+ auto *Src = getPointer(IndexLoad);
+ if (!Src)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
+
+ auto *DstLoad = locateAddressFromIntrinsic(Dst);
+ if (!DstLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
+ auto *Ptr = getPointer(DstLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
+ {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ Indexes},
+ nullptr);
+ }
+ return Gather;
+ } else if (Qual == HvxIdioms::HEX_Scatter) {
+ // This is the case when the result of a gather is used as an argument to
+ // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
+ // ourselves. We have to create an alloca, store to it, and replace all
+ // uses with the reloaded value.
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ CastIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ } else if (Qual == HvxIdioms::HEX_Gather) {
+ // Gather feeds into another gather that has already been replaced with
+ // hexagon_V6_vgathermh_128B.
+ if (isa<AllocaInst>(IndexLoad)) {
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ if (cstDataVector) {
+ // Our indexes are represented as a constant. We need them in a reg.
+ AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ Value *LoadedIndex = Builder.CreateLoad(
+ IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
+ << "\n AddressSpace: "
+ << ResultAlloca->getAddressSpace() << "\n";);
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ }
+ }
+ } else if (Qual == HvxIdioms::LLVM_Gather) {
+ // Gather feeds into another gather
+ errs() << " Underimplemented vgather to vgather sequence\n";
+ return nullptr;
+ } else
+ llvm_unreachable("Unhandled Qual enum");
+
+ return Gather;
+}
+
auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
const FxpOp &Op) const -> Value * {
assert(Op.X.Val->getType() == Op.Y.Val->getType());
@@ -2138,6 +3000,26 @@ auto HvxIdioms::run() -> bool {
It = StartOver ? B.rbegin()
: cast<Instruction>(New)->getReverseIterator();
Changed = true;
+ } else if (matchGather(*It)) {
+ Value *New = processVGather(*It);
+ if (!New)
+ continue;
+ LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
+ // We replace the original intrinsic with a new pseudo call.
+ It->eraseFromParent();
+ It = cast<Instruction>(New)->getReverseIterator();
+ RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+ Changed = true;
+ } else if (matchScatter(*It)) {
+ Value *New = processVScatter(*It);
+ if (!New)
+ continue;
+ LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
+ // We replace the original intrinsic with a new pseudo call.
+ It->eraseFromParent();
+ It = cast<Instruction>(New)->getReverseIterator();
+ RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+ Changed = true;
}
}
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 6455757..2f59b7c 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -186,6 +186,9 @@ static unsigned featureToArchVersion(unsigned Feature) {
case Hexagon::ArchV79:
case Hexagon::ExtensionHVXV79:
return 79;
+ case Hexagon::ArchV81:
+ case Hexagon::ExtensionHVXV81:
+ return 81;
}
llvm_unreachable("Expected valid arch feature");
return 0;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 6b48a21..b8075bd 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -96,6 +96,8 @@ cl::opt<bool> MV75("mv75", cl::Hidden, cl::desc("Build for Hexagon V75"),
cl::init(false));
cl::opt<bool> MV79("mv79", cl::Hidden, cl::desc("Build for Hexagon V79"),
cl::init(false));
+cl::opt<bool> MV81("mv81", cl::Hidden, cl::desc("Build for Hexagon V81"),
+ cl::init(false));
} // namespace
static cl::opt<Hexagon::ArchEnum> EnableHVX(
@@ -111,6 +113,7 @@ static cl::opt<Hexagon::ArchEnum> EnableHVX(
clEnumValN(Hexagon::ArchEnum::V73, "v73", "Build for HVX v73"),
clEnumValN(Hexagon::ArchEnum::V75, "v75", "Build for HVX v75"),
clEnumValN(Hexagon::ArchEnum::V79, "v79", "Build for HVX v79"),
+ clEnumValN(Hexagon::ArchEnum::V81, "v81", "Build for HVX v81"),
// Sentinel for no value specified.
clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
// Sentinel for flag not present.
@@ -159,6 +162,8 @@ static StringRef HexagonGetArchVariant() {
return "hexagonv75";
if (MV79)
return "hexagonv79";
+ if (MV81)
+ return "hexagonv81";
return "";
}
@@ -474,6 +479,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
case Hexagon::ArchEnum::V79:
Result.push_back("+hvxv79");
break;
+ case Hexagon::ArchEnum::V81:
+ Result.push_back("+hvxv81");
+ break;
case Hexagon::ArchEnum::Generic: {
Result.push_back(StringSwitch<StringRef>(CPU)
@@ -489,7 +497,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
.Case("hexagonv71t", "+hvxv71")
.Case("hexagonv73", "+hvxv73")
.Case("hexagonv75", "+hvxv75")
- .Case("hexagonv79", "+hvxv79"));
+ .Case("hexagonv79", "+hvxv79")
+ .Case("hexagonv81", "+hvxv81"));
break;
}
case Hexagon::ArchEnum::NoArch:
@@ -538,8 +547,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
FeatureBitset FB = S;
unsigned CpuArch = ArchV5;
for (unsigned F :
- {ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, ArchV66,
- ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
+ {ArchV81, ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67,
+ ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
if (!FB.test(F))
continue;
CpuArch = F;
@@ -556,7 +565,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
for (unsigned F :
{ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66,
ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
- ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79}) {
+ ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79, ExtensionHVXV81}) {
if (!FB.test(F))
continue;
HasHvxVer = true;
@@ -569,6 +578,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
// HasHvxVer is false, and UseHvx is true.
switch (CpuArch) {
+ case ArchV81:
+ FB.set(ExtensionHVXV81);
+ [[fallthrough]];
case ArchV79:
FB.set(ExtensionHVXV79);
[[fallthrough]];
@@ -668,12 +680,12 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {
std::optional<unsigned>
Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
- for (auto Arch : {Hexagon::ExtensionHVXV79, Hexagon::ExtensionHVXV75,
- Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71,
- Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68,
- Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66,
- Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62,
- Hexagon::ExtensionHVXV60})
+ for (auto Arch : {Hexagon::ExtensionHVXV81, Hexagon::ExtensionHVXV79,
+ Hexagon::ExtensionHVXV75, Hexagon::ExtensionHVXV73,
+ Hexagon::ExtensionHVXV71, Hexagon::ExtensionHVXV69,
+ Hexagon::ExtensionHVXV68, Hexagon::ExtensionHVXV67,
+ Hexagon::ExtensionHVXV66, Hexagon::ExtensionHVXV65,
+ Hexagon::ExtensionHVXV62, Hexagon::ExtensionHVXV60})
if (Features.test(Arch))
return Arch;
return {};
@@ -681,13 +693,13 @@ Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {
for (auto Arch :
- {Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, Hexagon::ArchV71,
- Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, Hexagon::ArchV66,
- Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, Hexagon::ArchV55,
- Hexagon::ArchV5})
+ {Hexagon::ArchV81, Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73,
+ Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67,
+ Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60,
+ Hexagon::ArchV55, Hexagon::ArchV5})
if (Features.test(Arch))
return Arch;
- llvm_unreachable("Expected arch v5-v79");
+ llvm_unreachable("Expected arch v5-v81");
return 0;
}
@@ -708,7 +720,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
.Case("hexagonv71t", llvm::ELF::EF_HEXAGON_MACH_V71T)
.Case("hexagonv73", llvm::ELF::EF_HEXAGON_MACH_V73)
.Case("hexagonv75", llvm::ELF::EF_HEXAGON_MACH_V75)
- .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79);
+ .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79)
+ .Case("hexagonv81", llvm::ELF::EF_HEXAGON_MACH_V81);
}
llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index aca7abd..44d1a44 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -4578,6 +4578,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
+def : InstAlias<"mtpidr $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsISA3_0]>;
+def : InstAlias<"mfpidr $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsISA3_0]>;
foreach SPRG = 4-7 in {
def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 9e6b7f0..2754d78 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1124,7 +1124,8 @@ def HasStdExtZbkbOrP
"'Base P' (Packed-SIMD)">;
def HasStdExtZbbOrZbkbOrP
- : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">,
+ : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || "
+ "Subtarget->hasStdExtP()">,
AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP),
"'Zbb' (Basic Bit-Manipulation) or "
"'Zbkb' (Bitmanip instructions for Cryptography) or "
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 219e3f2..1c930ac 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -318,8 +318,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
- if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() &&
- !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() &&
+ if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() &&
+ !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() &&
+ !Subtarget.hasVendorXAndesPerf() &&
!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
@@ -392,7 +393,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
}
- if (Subtarget.hasStdExtZbb() ||
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() ||
(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,
Legal);
@@ -403,6 +404,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
} else {
setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ // If we have CLZW but not CTZW, custom promote i32.
+ if (Subtarget.hasStdExtP() && Subtarget.is64Bit())
+ setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
if (!Subtarget.hasCPOPLike()) {
@@ -419,13 +423,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// We need the custom lowering to make sure that the resulting sequence
// for the 32bit case is efficient on 64bit targets.
// Use default promotion for i32 without Zbb.
- if (Subtarget.is64Bit() && Subtarget.hasStdExtZbb())
+ if (Subtarget.is64Bit() &&
+ (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP()))
setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
} else {
setOperationAction(ISD::CTLZ, XLenVT, Expand);
}
- if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) {
+ if (Subtarget.hasStdExtP() ||
+ (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
setOperationAction(ISD::ABS, XLenVT, Legal);
} else if (Subtarget.hasShortForwardBranchOpt()) {
// We can use PseudoCCSUB to implement ABS.
@@ -14669,6 +14675,25 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
bool IsCTZ =
N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+
+ // Without Zbb, lower as 32 - clzw(~X & (X-1))
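+ // (~X & (X-1) has exactly the bits below the lowest set bit of X set, so
+ // clzw of it is 32 - cttz(X); for X == 0 the mask is all ones and clzw
+ // returns 0, giving the expected 32.)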
+ if (IsCTZ && !Subtarget.hasStdExtZbb()) {
+ assert(Subtarget.hasStdExtP());
+
+ NewOp0 = DAG.getFreeze(NewOp0);
+ SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64);
+ SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0,
+ DAG.getConstant(1, DL, MVT::i64));
+ SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1);
+ SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64,
+ DAG.getConstant(32, DL, MVT::i64), CLZW);
+ SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub,
+ DAG.getValueType(MVT::i32));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
+
unsigned Opc = IsCTZ ? RISCVISD::CTZW : RISCVISD::CLZW;
SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 7d8a919..cc085bb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1455,3 +1455,11 @@ let Predicates = [HasStdExtP, IsRV32] in {
def PMAXU_DW : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">;
def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">;
} // Predicates = [HasStdExtP, IsRV32]
+
+//===----------------------------------------------------------------------===//
+// Codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtP] in
+def : PatGpr<abs, ABS>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 4c2f7f6..f7b4914 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -218,11 +218,13 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0,
}
let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in {
- def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">;
+ def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">,
+ SchedUnaryMC<"WriteSF_VFExp", "ReadSF_VFExp">;
}
let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in {
- def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">;
+ def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">,
+ SchedUnaryMC<"WriteSF_VFExpa", "ReadSF_VFExpa">;
}
let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector",
@@ -487,6 +489,48 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {
defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;
}
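+// SEW values to instantiate per LMUL: the BF16 variant only exists for
+// SEW=16, the approximate variant supports every FP SEW, and the base
+// variant excludes SEW=64.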
+class VFExpSchedSEWSet<string mx, bit IsBF16, bit IsApprox> {
+ defvar BaseSet = SchedSEWSet<mx, isF=1>.val;
+ list<int> val = !if(IsBF16, !listremove(BaseSet, [32, 64]),
+ !if(IsApprox, BaseSet, !listremove(BaseSet, [64])));
+}
+multiclass VPseudoVFExp_V<bit IsBF16 = false, bit IsApprox = false> {
+ defvar SchedSuffix = !if(IsApprox, "VFExpa", "VFExp");
+
+ foreach m = MxListF in {
+ defvar mx = m.MX;
+ foreach e = VFExpSchedSEWSet<mx, IsBF16, IsApprox>.val in {
+ let VLMul = m.value in {
+ def "_V_" # mx # "_E" # e
+ : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
+ SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix,
+ mx, e, forcePassthruRead=true>;
+ def "_V_" # mx # "_E" # e # "_MASK"
+ : VPseudoUnaryMask<m.vrclass, m.vrclass>,
+ RISCVMaskedPseudo<MaskIdx = 2>,
+ SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix,
+ mx, e, forcePassthruRead=true>;
+ }
+ }
+ }
+}
+
+let Predicates = [HasVendorXSfvfbfexp16e], hasSideEffects = 0 in {
+ let AltFmtType = IS_ALTFMT in {
+ defm PseudoSF_VFEXP_ALT : VPseudoVFExp_V<IsBF16=true>;
+ }
+}
+
+let Predicates = [HasVendorXSfvfexpAnyFloat], hasSideEffects = 0 in {
+ let AltFmtType = IS_NOT_ALTFMT in {
+ defm PseudoSF_VFEXP : VPseudoVFExp_V;
+ }
+}
+
+let Predicates = [HasVendorXSfvfexpa], AltFmtType = IS_NOT_ALTFMT in {
+ defm PseudoSF_VFEXPA : VPseudoVFExp_V<IsApprox=true>;
+}
+
// SDNode
def SDT_SF_VC_V_X : SDTypeProfile<1, 4, [SDTCisVec<0>,
SDTCisVT<1, XLenVT>,
@@ -893,3 +937,36 @@ let Predicates = [HasVendorXSfcease] in {
let rs2 = 0b00101;
}
}
+
+let Predicates = [HasVendorXSfvfbfexp16e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP_ALT",
+ AllBF16Vectors,
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexp16e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP",
+ [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8],
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexp32e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP",
+ [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa, HasVInstructionsF16] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8],
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa64e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF64M1, VF64M2, VF64M4, VF64M8], isSEWAware=1>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 6b9a75f..5429c2a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -599,14 +599,20 @@ def : PatGpr<riscv_zip, ZIP_RV32, i32>;
def : PatGpr<riscv_unzip, UNZIP_RV32, i32>;
} // Predicates = [HasStdExtZbkb, IsRV32]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : PatGpr<ctlz, CLZ>;
+}
+
+let Predicates = [HasStdExtZbb] in {
def : PatGpr<cttz, CTZ>;
def : PatGpr<ctpop, CPOP>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrP, IsRV64] in {
def : PatGpr<riscv_clzw, CLZW>;
+}
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
def : PatGpr<riscv_ctzw, CTZW>;
def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
@@ -614,22 +620,22 @@ def : Pat<(i64 (riscv_negw_max GPR:$rs1)),
(MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>;
def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : PatGprGpr<smin, MIN>;
def : PatGprGpr<smax, MAX>;
def : PatGprGpr<umin, MINU>;
def : PatGprGpr<umax, MAXU>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in
def : PatGpr<bswap, REV8_RV32, i32>;
-let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in
def : PatGpr<bswap, REV8_RV64, i64>;
let Predicates = [HasStdExtZbkb] in {
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 637d61fe..36a2f46 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -1588,6 +1588,10 @@ multiclass SiFive7SchedResources<int vlen, bit dualVALU,
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
+ // TODO: scheduling info of XSfvfexp* and XSfvfexpa*
+ // for SiFive7 will be added in follow-up patches.
+ defm : UnsupportedSchedXSfvfexp;
+ defm : UnsupportedSchedXSfvfexpa;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbc;
defm : UnsupportedSchedZbkb;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index 9ab9636..64ccfd8 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -523,6 +523,8 @@ include "RISCVScheduleZvk.td"
// Vendor Extensions
multiclass UnsupportedSchedXsf {
defm : UnsupportedSchedXsfvcp;
+ defm : UnsupportedSchedXSfvfexp;
+ defm : UnsupportedSchedXSfvfexpa;
defm : UnsupportedSchedXSfvfnrclipxfqf;
defm : UnsupportedSchedXSfvfwmaccqqq;
defm : UnsupportedSchedXSfvqmaccdod;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
index 99632e4..1ee6dc1 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
@@ -99,3 +99,23 @@ defm : LMULWriteRes<"WriteSF_VFWMACC_QQQ", []>;
defm : LMULReadAdvance<"ReadSF_VFWMACC_QQQ", 0>;
} // Unsupported = true
}
+
+defm "" : LMULSEWSchedWritesF<"WriteSF_VFExp">;
+defm "" : LMULSEWSchedReadsF<"ReadSF_VFExp">;
+
+multiclass UnsupportedSchedXSfvfexp {
+let Unsupported = true in {
+defm : LMULSEWWriteResF<"WriteSF_VFExp", []>;
+defm : LMULSEWReadAdvanceF<"ReadSF_VFExp", 0>;
+} // Unsupported = true
+}
+
+defm "" : LMULSEWSchedWritesF<"WriteSF_VFExpa">;
+defm "" : LMULSEWSchedReadsF<"ReadSF_VFExpa">;
+
+multiclass UnsupportedSchedXSfvfexpa {
+let Unsupported = true in {
+defm : LMULSEWWriteResF<"WriteSF_VFExpa", []>;
+defm : LMULSEWReadAdvanceF<"ReadSF_VFExpa", 0>;
+} // Unsupported = true
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 334db4b..4b4fc8f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -187,7 +187,7 @@ public:
}
bool hasCLZLike() const {
- return HasStdExtZbb || HasVendorXTHeadBb ||
+ return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb ||
(HasVendorXCVbitmanip && !IsRV64);
}
bool hasCTZLike() const {
@@ -197,7 +197,7 @@ public:
return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64);
}
bool hasREV8Like() const {
- return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb;
+ return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb;
}
bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 669d4f0..8d9933b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -582,6 +582,18 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1);
return BinaryOperator::CreateSub(ConstCtlz, X);
}
+
+ // ctlz(~x & (x - 1)) -> bitwidth - cttz(x, false)
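+ // (~x & (x - 1) sets exactly the bits below the lowest set bit of x;
+ // e.g. x = 0b01000 gives 0b00111, so its ctlz is bitwidth - cttz(x).)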
+ if (Op0->hasOneUse() &&
+ match(Op0,
+ m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
+ Type *Ty = II.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty,
+ {X, IC.Builder.getFalse()});
+ auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
+ return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
+ }
}
// cttz(Pow2) -> Log2(Pow2)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 5aa8de3..f5130da 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4697,5 +4697,31 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),
CondVal, FalseVal));
+ unsigned BitWidth = SI.getType()->getScalarSizeInBits();
+ CmpPredicate Pred;
+ Value *CmpLHS, *CmpRHS;
+
+ // Canonicalize sign function ashr patterns:
+ // select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+ // select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
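+ // (ashr X, bitwidth-1 is -1 when X is negative and 0 otherwise, so both
+ // selects produce -1/0/1 according to the sign of X, which is exactly
+ // scmp(X, 0).)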
+ if (match(&SI, m_Select(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
+ m_Value(TrueVal), m_Value(FalseVal))) &&
+ ((Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_One()) &&
+ match(TrueVal,
+ m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1))) &&
+ match(FalseVal, m_One())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_Zero()) &&
+ match(TrueVal, m_One()) &&
+ match(FalseVal,
+ m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1)))))) {
+
+ Function *Scmp = Intrinsic::getOrInsertDeclaration(
+ SI.getModule(), Intrinsic::scmp, {SI.getType(), SI.getType()});
+ return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
+ }
+
return nullptr;
}
diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
new file mode 100644
index 0000000..64fad37
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 | FileCheck %s
+
+; for (i = 0; i < 3; i++) {
+; a[-k * i] = 1;
+; a[-k * i + (2 * k + 1)] = 2;
+; }
+;
+; When k = -1, dependency exists between the two stores. Accesses will be:
+;
+; - a[-k * i] : a[ 0], a[-1], a[-2]
+; - a[-k * i + (2 * k + 1)] : a[-1], a[-2], a[-3]
+;
+; We cannot determine the sign of `k` and `2*k + 1` at compile time, so
+; the dependence between the two stores cannot be ruled out.
+;
+define void @unknown_sign(ptr %a, i64 %k) {
+; CHECK-LABEL: 'unknown_sign'
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - output [<>]!
+; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %k.neg = sub nsw i64 0, %k
+ %kk = mul nsw i64 %k, 2
+ %subscript.1.init = add i64 1, %kk
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %subscript.0 = phi i64 [ 0, %entry ], [ %subscript.0.next, %loop ]
+ %subscript.1 = phi i64 [ %subscript.1.init, %entry ], [ %subscript.1.next, %loop ]
+ %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
+ %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1
+ store i8 1, ptr %idx.0
+ store i8 2, ptr %idx.1
+ %i.next = add i64 %i, 1
+ %subscript.0.next = add nsw i64 %subscript.0, %k.neg
+ %subscript.1.next = add nsw i64 %subscript.1, %k.neg
+ %cond.exit = icmp eq i64 %i.next, 3
+ br i1 %cond.exit, label %exit, label %loop
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll
new file mode 100644
index 0000000..4fd4713
--- /dev/null
+++ b/llvm/test/Assembler/metadata-annotations.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s
+
+; CHECK: ; Materializable
+; CHECK-NEXT: define dso_local i32 @test() {}
+define dso_local i32 @test() {
+entry:
+ ret i32 0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0933e67..b54f262 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: cbz w2, .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %iter.check
+; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NEXT: .cfi_offset w23, -40
+; CHECK-SD-NEXT: .cfi_offset w24, -48
+; CHECK-SD-NEXT: .cfi_offset w25, -64
+; CHECK-SD-NEXT: sxtb x9, w1
+; CHECK-SD-NEXT: cmp w2, #3
+; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: b.hi .LBB6_4
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_13
+; CHECK-SD-NEXT: .LBB6_3:
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
+; CHECK-SD-NEXT: dup v0.2d, x9
+; CHECK-SD-NEXT: cmp w2, #16
+; CHECK-SD-NEXT: b.hs .LBB6_6
+; CHECK-SD-NEXT: // %bb.5:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_10
+; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x15, x0
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: .LBB6_7: // %vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr q17, [x15], #16
+; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
+; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
+; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
+; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
+; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
+; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
+; CHECK-SD-NEXT: fmov x17, d21
+; CHECK-SD-NEXT: mov x2, v21.d[1]
+; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
+; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
+; CHECK-SD-NEXT: fmov x18, d22
+; CHECK-SD-NEXT: fmov x1, d17
+; CHECK-SD-NEXT: fmov x3, d23
+; CHECK-SD-NEXT: fmov x21, d20
+; CHECK-SD-NEXT: fmov x22, d18
+; CHECK-SD-NEXT: fmov x19, d21
+; CHECK-SD-NEXT: mul x17, x13, x17
+; CHECK-SD-NEXT: mov x4, v22.d[1]
+; CHECK-SD-NEXT: fmov x24, d19
+; CHECK-SD-NEXT: mov x5, v23.d[1]
+; CHECK-SD-NEXT: mov x6, v21.d[1]
+; CHECK-SD-NEXT: mov x7, v20.d[1]
+; CHECK-SD-NEXT: mov x20, v18.d[1]
+; CHECK-SD-NEXT: mov x23, v19.d[1]
+; CHECK-SD-NEXT: mov x25, v17.d[1]
+; CHECK-SD-NEXT: mul x18, x14, x18
+; CHECK-SD-NEXT: mul x1, x13, x1
+; CHECK-SD-NEXT: fmov d17, x17
+; CHECK-SD-NEXT: mul x3, x13, x3
+; CHECK-SD-NEXT: fmov d18, x18
+; CHECK-SD-NEXT: mul x19, x13, x19
+; CHECK-SD-NEXT: fmov d19, x1
+; CHECK-SD-NEXT: mul x21, x13, x21
+; CHECK-SD-NEXT: fmov d20, x3
+; CHECK-SD-NEXT: mul x22, x13, x22
+; CHECK-SD-NEXT: fmov d21, x19
+; CHECK-SD-NEXT: mul x24, x13, x24
+; CHECK-SD-NEXT: fmov d24, x21
+; CHECK-SD-NEXT: mul x2, x8, x2
+; CHECK-SD-NEXT: fmov d22, x22
+; CHECK-SD-NEXT: mul x4, x8, x4
+; CHECK-SD-NEXT: fmov d23, x24
+; CHECK-SD-NEXT: mul x5, x8, x5
+; CHECK-SD-NEXT: mov v17.d[1], x2
+; CHECK-SD-NEXT: mul x6, x8, x6
+; CHECK-SD-NEXT: mov v18.d[1], x4
+; CHECK-SD-NEXT: mul x7, x8, x7
+; CHECK-SD-NEXT: mov v20.d[1], x5
+; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
+; CHECK-SD-NEXT: mul x20, x8, x20
+; CHECK-SD-NEXT: mov v21.d[1], x6
+; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
+; CHECK-SD-NEXT: mul x23, x8, x23
+; CHECK-SD-NEXT: mov v24.d[1], x7
+; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
+; CHECK-SD-NEXT: mul x17, x8, x25
+; CHECK-SD-NEXT: mov v22.d[1], x20
+; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
+; CHECK-SD-NEXT: mov v23.d[1], x23
+; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
+; CHECK-SD-NEXT: mov v19.d[1], x17
+; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
+; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
+; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_7
+; CHECK-SD-NEXT: // %bb.8: // %middle.block
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
+; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: addp d1, v1.2d
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
+; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
+; CHECK-SD-NEXT: fmov x15, d0
+; CHECK-SD-NEXT: sub x12, x13, x11
+; CHECK-SD-NEXT: add x13, x0, x13
+; CHECK-SD-NEXT: mov v1.d[0], x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr s0, [x13], #4
+; CHECK-SD-NEXT: adds x12, x12, #4
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: fmov x16, d4
+; CHECK-SD-NEXT: fmov x18, d0
+; CHECK-SD-NEXT: mov x17, v4.d[1]
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: mul x16, x14, x16
+; CHECK-SD-NEXT: mul x18, x15, x18
+; CHECK-SD-NEXT: mul x17, x8, x17
+; CHECK-SD-NEXT: fmov d0, x16
+; CHECK-SD-NEXT: mul x1, x8, x1
+; CHECK-SD-NEXT: fmov d4, x18
+; CHECK-SD-NEXT: mov v0.d[1], x17
+; CHECK-SD-NEXT: mov v4.d[1], x1
+; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_11
+; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
+; CHECK-SD-NEXT: sub x10, x10, x11
+; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: .LBB6_14: // %for.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldrb w12, [x11], #1
+; CHECK-SD-NEXT: subs x10, x10, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: b.ne .LBB6_14
+; CHECK-SD-NEXT: .LBB6_15:
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: cbz w2, .LBB6_7
+; CHECK-GI-NEXT: // %bb.1: // %iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #4
+; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: b.lo .LBB6_12
+; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: dup v1.2d, x9
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #16
+; CHECK-GI-NEXT: b.lo .LBB6_9
+; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: xtn v2.2s, v1.2d
+; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: .LBB6_4: // %vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr q18, [x12], #16
+; CHECK-GI-NEXT: subs x13, x13, #16
+; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-GI-NEXT: mov d22, v20.d[1]
+; CHECK-GI-NEXT: mov d23, v19.d[1]
+; CHECK-GI-NEXT: mov d24, v21.d[1]
+; CHECK-GI-NEXT: mov d25, v18.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
+; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
+; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
+; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
+; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
+; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: b.ne .LBB6_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
+; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: b.ne .LBB6_8
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_7:
+; CHECK-GI-NEXT: mov x8, xzr
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
+; CHECK-GI-NEXT: cbz x8, .LBB6_12
+; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x11
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
+; CHECK-GI-NEXT: sub x8, x12, x11
+; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: adds x8, x8, #4
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: mov b4, v3.b[2]
+; CHECK-GI-NEXT: mov b5, v3.b[1]
+; CHECK-GI-NEXT: mov b6, v3.b[3]
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: fmov w14, s4
+; CHECK-GI-NEXT: fmov w15, s5
+; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: uxtb w14, w14
+; CHECK-GI-NEXT: uxtb w15, w15
+; CHECK-GI-NEXT: uxtb w16, w16
+; CHECK-GI-NEXT: fmov s4, w14
+; CHECK-GI-NEXT: mov v3.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[1], w16
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: b.ne .LBB6_10
+; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: b.eq .LBB6_14
+; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
+; CHECK-GI-NEXT: sub x10, x10, x11
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: .LBB6_13: // %for.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: subs x10, x10, #1
+; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: b.ne .LBB6_13
+; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %cmp5.not = icmp eq i32 %n, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check
+
+iter.check: ; preds = %entry
+ %conv1 = sext i8 %B to i64
+ %wide.trip.count = zext i32 %n to i64
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check: ; preds = %iter.check
+ %min.iters.check9 = icmp ult i32 %n, 16
+ br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph: ; preds = %vector.main.loop.iter.check
+ %n.mod.vf = and i64 %wide.trip.count, 12
+ %n.vec = and i64 %wide.trip.count, 4294967280
+ %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %1 = zext <16 x i8> %wide.load to <16 x i64>
+ %2 = mul nsw <16 x i64> %broadcast.splat, %1
+ %3 = add <16 x i64> %2, %vec.phi
+ %index.next = add nuw i64 %index, 16
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3)
+ %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+ br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check: ; preds = %middle.block
+ %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0
+ br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec11 = and i64 %wide.trip.count, 4294967292
+ %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0
+ %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+ %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ]
+ %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ]
+ %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14
+ %wide.load16 = load <4 x i8>, ptr %7, align 1
+ %8 = zext <4 x i8> %wide.load16 to <4 x i64>
+ %9 = mul nsw <4 x i64> %broadcast.splat13, %8
+ %10 = add <4 x i64> %9, %vec.phi15
+ %index.next17 = add nuw i64 %index14, 4
+ %11 = icmp eq i64 %index.next17, %n.vec11
+ br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
+ %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10)
+ %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count
+ br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+ %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ]
+ %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ]
+ ret i64 %s.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+ %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv
+ %13 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %13 to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %add = add nsw i64 %mul, %s.06
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB6_1: // %loop
+; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB6_1
+; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB6_1: // %loop
+; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB6_1
+; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB7_1: // %loop
+; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB7_1
+; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB7_1: // %loop
+; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB7_1
+; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB8_1: // %loop
+; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB8_1
+; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB8_1: // %loop
+; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB8_1
+; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB9_1: // %loop
+; CHECK-SD-NEXT: .LBB10_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB9_1
+; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB9_1: // %loop
+; CHECK-GI-NEXT: .LBB10_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB9_1
+; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
-; CHECK-SD-NEXT: .LBB10_1: // %vector.body
+; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB10_1
+; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB10_1: // %vector.body
+; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB10_1
+; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
-; CHECK-SD-NEXT: .LBB11_1: // %vector.body
+; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB11_1: // %vector.body
+; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
; CHECK-GI-NEXT: stp q1, q3, [x9]
; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB12_1: // %vector.body
+; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB12_1
+; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
-; CHECK-GI-NEXT: .LBB12_1: // %vector.body
+; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB12_1
+; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB13_1: // %vector.body
+; CHECK-SD-NEXT: .LBB14_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB13_1
+; CHECK-SD-NEXT: b.ne .LBB14_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
-; CHECK-GI-NEXT: .LBB13_1: // %vector.body
+; CHECK-GI-NEXT: .LBB14_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB13_1
+; CHECK-GI-NEXT: b.ne .LBB14_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: lsr x9, x0, #16
-; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: adrp x8, .LCPI15_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0]
; CHECK-SD-NEXT: dup v2.8h, w9
; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
@@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr w9, w0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
; CHECK-GI-NEXT: dup v1.8h, w9
; CHECK-GI-NEXT: fmov s3, w9
; CHECK-GI-NEXT: sqneg v2.8h, v1.8h
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: dup v3.8h, w0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..0cd885e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000..cf52934
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 1
+ %ext = zext i32 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
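The new test file above (and the dup.ll updates in the next hunk) pin down the same codegen change: a narrow load feeding a DUP should go straight into a SIMD register instead of bouncing through a GPR. A condensed sketch of the pattern, with the before/after assembly paraphrased from the checks in this diff; the function name is illustrative and the block is not part of the commit:

```llvm
; Minimal sketch of the load+dup pattern this combine targets (illustrative only).
define <8 x i8> @dup_load_i8(ptr %p) {
  %v = load i8, ptr %p
  %ins = insertelement <8 x i8> poison, i8 %v, i32 0
  %dup = shufflevector <8 x i8> %ins, <8 x i8> poison, <8 x i32> zeroinitializer
  ; before: ldrb w8, [x0]      / dup v0.8b, w8      (GPR -> SIMD transfer)
  ; after:  ldr  b0, [x0]      / dup v0.8b, v0.b[0] (load lands in a vector register)
  ret <8 x i8> %dup
}
```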
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff10..670574f2 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 6177ae5..628506b 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -84,8 +84,7 @@ entry:
define double @load_u64_from_u32_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -98,8 +97,7 @@ entry:
define double @load_u64_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -125,8 +123,7 @@ entry:
define float @load_u32_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -168,8 +165,7 @@ entry:
define double @load_u64_from_u32_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -250,8 +246,7 @@ entry:
define double @load_u64_from_u32_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -264,8 +259,7 @@ entry:
define double @load_u64_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -291,8 +285,7 @@ entry:
define float @load_u32_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -494,8 +487,8 @@ entry:
define double @load_u64_from_u32_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
-; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 16384
@@ -508,8 +501,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -522,8 +515,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -536,8 +529,8 @@ entry:
define float @load_u32_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -550,8 +543,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -564,8 +557,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
-; CHECK-NEXT: ldr b0, [x8]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b8d6c88..3f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-SDAG-NEXT: bl __arm_sme_restore
; CHECK-SDAG-NEXT: b .LBB5_1
entry:
- invoke void @agnostic_za_call()
+ invoke void @agnostic_za_call() "aarch64_za_state_agnostic"
to label %exit unwind label %catch
catch:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 353c09b..ecd7cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = ashr i65 %value, 33
ret i65 %result
@@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
new file mode 100644
index 0000000..48e9818
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+
+---
+name: test_combine_or_s64_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %0, %2
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %2, %0
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = COPY $sgpr2
+ %3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %4:_(s64) = G_ZEXT %2(s32)
+ %5:_(s64) = G_OR %3, %4
+ %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64)
+ $sgpr0 = COPY %6(s32)
+ $sgpr1 = COPY %7(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-LABEL: name: negative_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_ZEXT %1
+ %3:_(s128) = G_OR %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
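The new MIR test above exercises a combine that narrows a 64-bit OR whose second operand is a zero-extended 32-bit value. A minimal LLVM IR sketch of the identity it relies on (illustrative only, not from the commit): since `zext(y)` has a zero high half, the high half of `x | zext(y)` is just the high half of `x`, so a single 32-bit OR on the low half suffices, which is what the `G_UNMERGE_VALUES`/`G_OR`/`G_MERGE_VALUES` checks encode.

```llvm
; or(x, zext(y)) == merge(hi(x), lo(x) | y) -- sketch of the combine's identity.
define i64 @or_zext_identity(i64 %x, i32 %y) {
  %lo = trunc i64 %x to i32
  %lo.or = or i32 %lo, %y            ; only the low 32 bits change
  %lo.ext = zext i32 %lo.or to i64
  %hi = and i64 %x, -4294967296      ; high half of x is untouched
  %r = or i64 %hi, %lo.ext
  ret i64 %r
}
```

The same identity explains the scalar AMDGPU changes in the fshl/fshr hunks below, where an `s_or_b64` against a pair with a known-zero high register is shrunk to a single `s_or_b32`.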
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 5dff8c1..667fa98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -227,39 +227,38 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_getpc_b64 s[12:13]
-; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_mov_b32 s2, s0
-; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
-; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s12, s12, 4
+; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_add_i32 s2, s3, s2
+; GFX10-NEXT: v_readfirstlane_b32 s12, v3
+; GFX10-NEXT: s_add_i32 s3, s12, s3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index bd53032..715a777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4934,17 +4934,15 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GCN-NEXT: s_lshr_b32 s2, s3, 27
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX11-NEXT: s_lshr_b32 s2, s3, 27
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
ret i64 %result
@@ -4954,20 +4952,13 @@ define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshl_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -6823,56 +6814,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX6-NEXT: s_lshr_b32 s4, s5, 31
-; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s4, s7, 31
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128_65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX8-NEXT: s_lshr_b32 s4, s5, 31
-; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s7, 31
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX9-NEXT: s_lshr_b32 s4, s5, 31
-; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s7, 31
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_or_b32 s2, s2, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128_65:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s5, 31
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX10-NEXT: s_lshr_b32 s2, s7, 31
-; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX10-NEXT: s_lshr_b32 s4, s5, 31
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX10-NEXT: s_lshr_b32 s5, s7, 31
+; GFX10-NEXT: s_or_b32 s0, s0, s4
+; GFX10-NEXT: s_or_b32 s2, s2, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i128_65:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s5, 31
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX11-NEXT: s_lshr_b32 s2, s7, 31
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX11-NEXT: s_lshr_b32 s4, s5, 31
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX11-NEXT: s_lshr_b32 s5, s7, 31
+; GFX11-NEXT: s_or_b32 s0, s0, s4
+; GFX11-NEXT: s_or_b32 s2, s2, s5
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -6885,7 +6870,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6896,7 +6881,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6907,7 +6892,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6919,7 +6904,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -6931,7 +6916,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v2, v2, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ea6b3a3..5aa5a671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4715,20 +4715,13 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshr_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -4739,17 +4732,15 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GCN-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_48:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
ret i64 %result
@@ -5293,34 +5284,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX6-NEXT: s_not_b32 s9, s8
-; GFX6-NEXT: s_sub_i32 s16, s2, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s16, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX6-NEXT: s_sub_i32 s14, s0, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s0
-; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX6-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s9, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s9
+; GFX6-NEXT: s_cmp_lt_u32 s9, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: s_cmp_eq_u32 s9, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5330,9 +5320,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
@@ -5340,34 +5330,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX8-NEXT: s_not_b32 s9, s8
-; GFX8-NEXT: s_sub_i32 s16, s2, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s16, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX8-NEXT: s_sub_i32 s14, s0, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s0
-; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX8-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s9, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s9
+; GFX8-NEXT: s_cmp_lt_u32 s9, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: s_cmp_eq_u32 s9, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5377,9 +5366,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
@@ -5387,34 +5376,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX9-NEXT: s_not_b32 s9, s8
-; GFX9-NEXT: s_sub_i32 s16, s2, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s16, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX9-NEXT: s_sub_i32 s14, s0, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s0
-; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX9-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s9, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s9
+; GFX9-NEXT: s_cmp_lt_u32 s9, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_cmp_eq_u32 s9, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5424,19 +5412,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s10, s1, 31
-; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
+; GFX10-NEXT: s_lshr_b32 s9, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_or_b32 s2, s2, s9
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_not_b32 s14, s8
; GFX10-NEXT: s_sub_i32 s16, s9, 64
; GFX10-NEXT: s_sub_i32 s10, 64, s9
@@ -5479,11 +5466,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s10, s1, 31
-; GFX11-NEXT: s_mov_b32 s11, 0
-; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
+; GFX11-NEXT: s_lshr_b32 s9, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_or_b32 s2, s2, s9
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_not_b32 s14, s8
; GFX11-NEXT: s_sub_i32 s16, s9, 64
; GFX11-NEXT: s_sub_i32 s10, 64, s9
@@ -5786,13 +5772,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_or_b32 s2, s2, s0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8
; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5803,8 +5788,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: v_mov_b32_e32 v4, s1
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s3
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5839,13 +5824,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_or_b32 s2, s2, s0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5856,8 +5840,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5892,12 +5876,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_or_b32 s2, s2, s0
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5908,10 +5891,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5941,34 +5924,33 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX10-NEXT: s_lshr_b32 s2, s1, 31
; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_or_b32 s8, s8, s2
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
@@ -5988,18 +5970,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX11-NEXT: s_lshr_b32 s2, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_or_b32 s8, s8, s2
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
@@ -6045,26 +6027,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX6-NEXT: s_not_b32 s5, s4
-; GFX6-NEXT: s_sub_i32 s12, s2, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s12, s0, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s14, 1, 0
; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
; GFX6-NEXT: s_sub_i32 s1, s0, 64
; GFX6-NEXT: s_sub_i32 s4, 64, s0
@@ -6073,14 +6054,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT: s_and_b32 s0, 1, s8
+; GFX6-NEXT: s_and_b32 s0, 1, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6088,10 +6069,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX6-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
@@ -6099,26 +6080,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX8-NEXT: s_not_b32 s5, s4
-; GFX8-NEXT: s_sub_i32 s12, s2, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s12, s0, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s14, 1, 0
; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s14, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
; GFX8-NEXT: s_sub_i32 s1, s0, 64
; GFX8-NEXT: s_sub_i32 s4, 64, s0
@@ -6127,14 +6107,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_and_b32 s0, 1, s8
+; GFX8-NEXT: s_and_b32 s0, 1, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6142,10 +6122,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
@@ -6153,26 +6133,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX9-NEXT: s_not_b32 s5, s4
-; GFX9-NEXT: s_sub_i32 s12, s2, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s12, s0, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s14, 1, 0
; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s14, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
; GFX9-NEXT: s_sub_i32 s1, s0, 64
; GFX9-NEXT: s_sub_i32 s4, 64, s0
@@ -6181,14 +6160,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_and_b32 s0, 1, s8
+; GFX9-NEXT: s_and_b32 s0, 1, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6196,20 +6175,19 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_lshr_b32 s5, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_not_b32 s10, s4
; GFX10-NEXT: s_sub_i32 s12, s5, 64
; GFX10-NEXT: s_sub_i32 s6, 64, s5
@@ -6259,11 +6237,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s6, s1, 31
-; GFX11-NEXT: s_mov_b32 s7, 0
-; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
+; GFX11-NEXT: s_lshr_b32 s5, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_not_b32 s10, s4
; GFX11-NEXT: s_sub_i32 s12, s5, 64
; GFX11-NEXT: s_sub_i32 s6, 64, s5
@@ -6714,81 +6691,80 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s22, s1, 31
-; GFX6-NEXT: s_mov_b32 s23, 0
; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX6-NEXT: s_not_b32 s17, s16
-; GFX6-NEXT: s_sub_i32 s21, s2, 64
-; GFX6-NEXT: s_sub_i32 s22, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
-; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s29, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX6-NEXT: s_cmp_lg_u32 s29, 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX6-NEXT: s_and_b32 s0, s16, 0x7f
; GFX6-NEXT: s_sub_i32 s21, s0, 64
; GFX6-NEXT: s_sub_i32 s22, 64, s0
; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX6-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX6-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s21, s17, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s17
+; GFX6-NEXT: s_cmp_lt_u32 s17, 64
+; GFX6-NEXT: s_cselect_b32 s24, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s25, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cmp_lg_u32 s25, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT: s_lshr_b32 s22, s5, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX6-NEXT: s_lshr_b32 s4, s5, 31
+; GFX6-NEXT: s_or_b32 s6, s6, s4
+; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_not_b32 s16, s20
-; GFX6-NEXT: s_sub_i32 s18, s6, 64
-; GFX6-NEXT: s_sub_i32 s10, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_sub_i32 s18, s4, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX6-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX6-NEXT: s_sub_i32 s18, s4, 64
-; GFX6-NEXT: s_sub_i32 s16, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX6-NEXT: s_sub_i32 s18, s8, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
@@ -6796,88 +6772,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s22, s1, 31
-; GFX8-NEXT: s_mov_b32 s23, 0
; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX8-NEXT: s_not_b32 s17, s16
-; GFX8-NEXT: s_sub_i32 s21, s2, 64
-; GFX8-NEXT: s_sub_i32 s22, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX8-NEXT: s_and_b32 s0, s16, 0x7f
; GFX8-NEXT: s_sub_i32 s21, s0, 64
; GFX8-NEXT: s_sub_i32 s22, 64, s0
; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX8-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX8-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s21, s17, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s17
+; GFX8-NEXT: s_cmp_lt_u32 s17, 64
+; GFX8-NEXT: s_cselect_b32 s24, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s17, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT: s_lshr_b32 s22, s5, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX8-NEXT: s_lshr_b32 s4, s5, 31
+; GFX8-NEXT: s_or_b32 s6, s6, s4
+; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_not_b32 s16, s20
-; GFX8-NEXT: s_sub_i32 s18, s6, 64
-; GFX8-NEXT: s_sub_i32 s10, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_sub_i32 s18, s4, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX8-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX8-NEXT: s_sub_i32 s18, s4, 64
-; GFX8-NEXT: s_sub_i32 s16, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX8-NEXT: s_sub_i32 s18, s8, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
@@ -6885,88 +6860,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s22, s1, 31
-; GFX9-NEXT: s_mov_b32 s23, 0
; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX9-NEXT: s_not_b32 s17, s16
-; GFX9-NEXT: s_sub_i32 s21, s2, 64
-; GFX9-NEXT: s_sub_i32 s22, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
-; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s29, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX9-NEXT: s_cmp_lg_u32 s29, 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX9-NEXT: s_and_b32 s0, s16, 0x7f
; GFX9-NEXT: s_sub_i32 s21, s0, 64
; GFX9-NEXT: s_sub_i32 s22, 64, s0
; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX9-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s21, s17, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s17
+; GFX9-NEXT: s_cmp_lt_u32 s17, 64
+; GFX9-NEXT: s_cselect_b32 s24, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s25, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cmp_lg_u32 s25, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT: s_lshr_b32 s22, s5, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX9-NEXT: s_lshr_b32 s4, s5, 31
+; GFX9-NEXT: s_or_b32 s6, s6, s4
+; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_not_b32 s16, s20
-; GFX9-NEXT: s_sub_i32 s18, s6, 64
-; GFX9-NEXT: s_sub_i32 s10, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_sub_i32 s18, s4, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX9-NEXT: s_sub_i32 s18, s4, 64
-; GFX9-NEXT: s_sub_i32 s16, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX9-NEXT: s_sub_i32 s18, s8, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
@@ -6974,61 +6948,60 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s18, s1, 31
-; GFX10-NEXT: s_mov_b32 s19, 0
-; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_lshr_b32 s17, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX10-NEXT: s_not_b32 s18, s16
-; GFX10-NEXT: s_sub_i32 s21, s17, 64
-; GFX10-NEXT: s_sub_i32 s22, 64, s17
+; GFX10-NEXT: s_or_b32 s2, s2, s17
+; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_not_b32 s21, s16
+; GFX10-NEXT: s_sub_i32 s26, s17, 64
+; GFX10-NEXT: s_sub_i32 s18, 64, s17
; GFX10-NEXT: s_cmp_lt_u32 s17, 64
-; GFX10-NEXT: s_cselect_b32 s28, 1, 0
+; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT: s_cmp_lg_u32 s28, 0
-; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX10-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX10-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX10-NEXT: s_cmp_lg_u32 s27, 0
+; GFX10-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_and_b32 s0, s16, 0x7f
-; GFX10-NEXT: s_sub_i32 s18, s0, 64
+; GFX10-NEXT: s_sub_i32 s21, s0, 64
; GFX10-NEXT: s_sub_i32 s17, 64, s0
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
-; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s26, 1, 0
+; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT: s_cmp_lg_u32 s26, 0
+; GFX10-NEXT: s_cmp_lg_u32 s25, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_lshr_b32 s18, s5, 31
+; GFX10-NEXT: s_lshr_b32 s8, s5, 31
+; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20
-; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX10-NEXT: s_not_b32 s16, s20
; GFX10-NEXT: s_sub_i32 s18, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
@@ -7071,54 +7044,53 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX11-LABEL: s_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s18, s1, 31
-; GFX11-NEXT: s_mov_b32 s19, 0
-; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_lshr_b32 s17, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: s_not_b32 s18, s16
-; GFX11-NEXT: s_sub_i32 s21, s17, 64
-; GFX11-NEXT: s_sub_i32 s22, 64, s17
+; GFX11-NEXT: s_or_b32 s2, s2, s17
+; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_not_b32 s21, s16
+; GFX11-NEXT: s_sub_i32 s26, s17, 64
+; GFX11-NEXT: s_sub_i32 s18, 64, s17
; GFX11-NEXT: s_cmp_lt_u32 s17, 64
-; GFX11-NEXT: s_cselect_b32 s28, 1, 0
+; GFX11-NEXT: s_cselect_b32 s27, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s17, 0
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX11-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX11-NEXT: s_cmp_lg_u32 s27, 0
+; GFX11-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_and_b32 s0, s16, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s18, s0, 64
+; GFX11-NEXT: s_sub_i32 s21, s0, 64
; GFX11-NEXT: s_sub_i32 s17, 64, s0
; GFX11-NEXT: s_cmp_lt_u32 s0, 64
-; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_cselect_b32 s24, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s26, 1, 0
+; GFX11-NEXT: s_cselect_b32 s25, 1, 0
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_cmp_lg_u32 s25, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_lshr_b32 s18, s5, 31
+; GFX11-NEXT: s_lshr_b32 s8, s5, 31
+; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_or_b32 s6, s6, s8
; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20
-; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX11-NEXT: s_not_b32 s16, s20
; GFX11-NEXT: s_sub_i32 s18, s8, 64
; GFX11-NEXT: s_sub_i32 s9, 64, s8
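A recurring rewrite in the fshr hunks above: the old sequences materialized a zero high half (s_mov_b32 ..., 0) purely so a 32-bit shifted-out word could feed an s_or_b64; the new sequences OR the live low words directly with s_or_b32 and let the untouched high half pass through. A minimal C sketch of the equivalence, with illustrative names not taken from the tests:

#include <stdint.h>

/* Before: OR against {b_lo, 0} with a 64-bit s_or_b64.
   After: one 32-bit s_or_b32 on the low words; the high word of the
   second operand is known zero, so the high half is unchanged. */
static uint64_t or_known_zero_hi(uint64_t a, uint32_t b_lo) {
  uint32_t lo = (uint32_t)a | b_lo;   /* s_or_b32 */
  uint32_t hi = (uint32_t)(a >> 32);  /* passes through untouched */
  return ((uint64_t)hi << 32) | lo;
}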
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000..4b214e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fmaximum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMAXIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMAXIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nnan G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
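The GFX9 check lines in this new test show how the legalizer expands G_FMAXIMUM on targets without a native instruction: an IEEE maxnum, an ordered compare, and a select that substitutes a quiet NaN whenever either input is NaN; on GFX12 the operation is legal and survives unchanged. A reference sketch of that expansion in C, using fmaxf as a stand-in for G_FMAXNUM_IEEE (an approximation; the two differ in signaling-NaN and signed-zero details):

#include <math.h>

static float fmaximum_ref(float a, float b) {
  float m = fmaxf(a, b);                /* G_FMAXNUM_IEEE */
  int ordered = !isnan(a) && !isnan(b); /* G_FCMP floatpred(ord) */
  return ordered ? m : NAN;             /* G_SELECT ..., qNaN constant */
}

The legalize-fminimum.mir file added next is the mirror image, with G_FMINNUM_IEEE in place of the maxnum step.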
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
new file mode 100644
index 0000000..8ba0794
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fminimum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s16) = G_FMINIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMINIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMINIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMINIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMINIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fminimum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMINNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMINIMUM1:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMINIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMINIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fminimum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nsz G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMINNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nnan G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 8533e34..518af70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1750,7 +1750,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1763,7 +1763,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1776,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1789,7 +1789,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1800,7 +1800,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = lshr i65 %value, 33
@@ -1859,21 +1859,19 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
index af377b1..e0581f01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
@@ -597,13 +597,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], 0x50
+; GFX7-NEXT: s_or_b32 s4, s3, 0x50
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -616,7 +616,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: s_mov_b32 s3, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX8-NEXT: s_or_b32 s2, s2, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX9-NEXT: s_or_b32 s2, s2, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -644,7 +644,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX10-NEXT: s_or_b32 s2, s2, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -658,7 +658,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX11-NEXT: s_or_b32 s2, s2, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_or_b32 s2, s2, 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index a9b3deb..cfe655f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1381,7 +1381,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,7 +1393,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1405,7 +1405,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1418,7 +1418,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%shl = shl i65 %value, 33
 %ashr = ashr i65 %shl, 33
@@ -1429,29 +1429,27 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_lshr_b32 s3, s1, 14
+; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT: s_lshl_b32 s5, s2, 14
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_mov_b32 s4, 0
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
+; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 14
; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
%ashr = ashr i65 %shl, 18
@@ -1464,13 +1462,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GCN-NEXT: s_lshl_b32 s3, s2, 1
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GCN-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GCN-NEXT: s_mov_b32 s1, s2
-; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_ashr_i32 s2, s5, 1
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
+; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_33:
@@ -1478,13 +1475,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s1, s2
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
+; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 33
%ashr = ashr i65 %shl, 33
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index afd0f01..6831380 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -415,28 +415,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-LABEL: memcpy_known:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -491,25 +481,16 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -960,28 +941,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-LABEL: memcpy_known_medium:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1036,25 +1007,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -1228,27 +1190,18 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s1
; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
@@ -1261,35 +1214,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13
+; GISEL-GFX1100-NEXT: s_load_b32 s11, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1
-; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX1100-NEXT: s_or_b64 s[0:1], s[12:13], s[8:9]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7]
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen
+; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
-; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6c8207a..df7f8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4344,7 +4344,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4375,14 +4375,12 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
@@ -4437,7 +4435,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4450,15 +4448,13 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index b5b2655..31344c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2080,21 +2080,13 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs
}

define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbad..62ec010 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s

define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}

define amdgpu_ps float @test_fmaximum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}

define amdgpu_ps float @test_fmaximum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}

define amdgpu_ps float @test_fmaximum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.maximum.f32(float %a, float %b)
ret float %val
}

+define amdgpu_ps float @test_fmaximum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fmaximum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_signed_zero_f32() {
+; GFX9-LABEL: test_fmaximum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fmaximum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fmaximum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v2
-; GCN-NEXT: v_maximum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v2
+; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}

define amdgpu_ps <2 x float> @test_fmaximum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s2
-; GCN-NEXT: s_maximum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s2
+; GFX12-NEXT: s_maximum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}

define amdgpu_ps <3 x float> @test_fmaximum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fmaximum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v3
-; GCN-NEXT: v_maximum_f32 v1, v1, v4
-; GCN-NEXT: v_maximum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v3
+; GFX12-NEXT: v_maximum_f32 v1, v1, v4
+; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}

define amdgpu_ps <4 x float> @test_fmaximum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fmaximum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v4
-; GCN-NEXT: v_maximum_f32 v1, v1, v5
-; GCN-NEXT: v_maximum_f32 v2, v2, v6
-; GCN-NEXT: v_maximum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v4
+; GFX12-NEXT: v_maximum_f32 v1, v1, v5
+; GFX12-NEXT: v_maximum_f32 v2, v2, v6
+; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}

define amdgpu_ps <16 x float> @test_fmaximum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fmaximum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v16
-; GCN-NEXT: v_maximum_f32 v1, v1, v17
-; GCN-NEXT: v_maximum_f32 v2, v2, v18
-; GCN-NEXT: v_maximum_f32 v3, v3, v19
-; GCN-NEXT: v_maximum_f32 v4, v4, v20
-; GCN-NEXT: v_maximum_f32 v5, v5, v21
-; GCN-NEXT: v_maximum_f32 v6, v6, v22
-; GCN-NEXT: v_maximum_f32 v7, v7, v23
-; GCN-NEXT: v_maximum_f32 v8, v8, v24
-; GCN-NEXT: v_maximum_f32 v9, v9, v25
-; GCN-NEXT: v_maximum_f32 v10, v10, v26
-; GCN-NEXT: v_maximum_f32 v11, v11, v27
-; GCN-NEXT: v_maximum_f32 v12, v12, v28
-; GCN-NEXT: v_maximum_f32 v13, v13, v29
-; GCN-NEXT: v_maximum_f32 v14, v14, v30
-; GCN-NEXT: v_maximum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_max_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_max_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_max_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_max_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_max_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_max_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v16
+; GFX12-NEXT: v_maximum_f32 v1, v1, v17
+; GFX12-NEXT: v_maximum_f32 v2, v2, v18
+; GFX12-NEXT: v_maximum_f32 v3, v3, v19
+; GFX12-NEXT: v_maximum_f32 v4, v4, v20
+; GFX12-NEXT: v_maximum_f32 v5, v5, v21
+; GFX12-NEXT: v_maximum_f32 v6, v6, v22
+; GFX12-NEXT: v_maximum_f32 v7, v7, v23
+; GFX12-NEXT: v_maximum_f32 v8, v8, v24
+; GFX12-NEXT: v_maximum_f32 v9, v9, v25
+; GFX12-NEXT: v_maximum_f32 v10, v10, v26
+; GFX12-NEXT: v_maximum_f32 v11, v11, v27
+; GFX12-NEXT: v_maximum_f32 v12, v12, v28
+; GFX12-NEXT: v_maximum_f32 v13, v13, v29
+; GFX12-NEXT: v_maximum_f32 v14, v14, v30
+; GFX12-NEXT: v_maximum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}

define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fmaximum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fmaximum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
}

define amdgpu_ps half @test_fmaximum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fmaximum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.maximum.f16(half %a, half %b)
ret half %val
}

define amdgpu_ps <2 x half> @test_fmaximum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fmaximum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}

define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}

define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
}

define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_max_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}

define amdgpu_ps <4 x half> @test_fmaximum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fmaximum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}

define amdgpu_ps <4 x half> @test_fmaximum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_maximum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}

define amdgpu_ps <2 x float> @test_fmaximum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fmaximum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}

define amdgpu_ps <2 x float> @test_fmaximum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fmaximum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}

define amdgpu_ps <4 x float> @test_fmaximum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}

define amdgpu_ps <8 x float> @test_fmaximum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fmaximum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_max_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_max_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fmaximumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_maximum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fmaximumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmaximumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_maximum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.maximum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fmaximum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fmaximum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fmaximum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.maximum.f32(float, float)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.maximum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f..474ac7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fminimum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fminimum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
+define amdgpu_ps float @test_fminimum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fminimum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_signed_zero_f32() {
+; GFX9-LABEL: test_fminimum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fminimum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fminimum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v2
-; GCN-NEXT: v_minimum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v2
+; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fminimum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s2
-; GCN-NEXT: s_minimum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s2
+; GFX12-NEXT: s_minimum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fminimum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fminimum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v3
-; GCN-NEXT: v_minimum_f32 v1, v1, v4
-; GCN-NEXT: v_minimum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v3
+; GFX12-NEXT: v_minimum_f32 v1, v1, v4
+; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fminimum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fminimum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v4
-; GCN-NEXT: v_minimum_f32 v1, v1, v5
-; GCN-NEXT: v_minimum_f32 v2, v2, v6
-; GCN-NEXT: v_minimum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v4
+; GFX12-NEXT: v_minimum_f32 v1, v1, v5
+; GFX12-NEXT: v_minimum_f32 v2, v2, v6
+; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fminimum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fminimum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v16
-; GCN-NEXT: v_minimum_f32 v1, v1, v17
-; GCN-NEXT: v_minimum_f32 v2, v2, v18
-; GCN-NEXT: v_minimum_f32 v3, v3, v19
-; GCN-NEXT: v_minimum_f32 v4, v4, v20
-; GCN-NEXT: v_minimum_f32 v5, v5, v21
-; GCN-NEXT: v_minimum_f32 v6, v6, v22
-; GCN-NEXT: v_minimum_f32 v7, v7, v23
-; GCN-NEXT: v_minimum_f32 v8, v8, v24
-; GCN-NEXT: v_minimum_f32 v9, v9, v25
-; GCN-NEXT: v_minimum_f32 v10, v10, v26
-; GCN-NEXT: v_minimum_f32 v11, v11, v27
-; GCN-NEXT: v_minimum_f32 v12, v12, v28
-; GCN-NEXT: v_minimum_f32 v13, v13, v29
-; GCN-NEXT: v_minimum_f32 v14, v14, v30
-; GCN-NEXT: v_minimum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_min_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_min_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_min_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_min_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_min_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_min_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_min_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v16
+; GFX12-NEXT: v_minimum_f32 v1, v1, v17
+; GFX12-NEXT: v_minimum_f32 v2, v2, v18
+; GFX12-NEXT: v_minimum_f32 v3, v3, v19
+; GFX12-NEXT: v_minimum_f32 v4, v4, v20
+; GFX12-NEXT: v_minimum_f32 v5, v5, v21
+; GFX12-NEXT: v_minimum_f32 v6, v6, v22
+; GFX12-NEXT: v_minimum_f32 v7, v7, v23
+; GFX12-NEXT: v_minimum_f32 v8, v8, v24
+; GFX12-NEXT: v_minimum_f32 v9, v9, v25
+; GFX12-NEXT: v_minimum_f32 v10, v10, v26
+; GFX12-NEXT: v_minimum_f32 v11, v11, v27
+; GFX12-NEXT: v_minimum_f32 v12, v12, v28
+; GFX12-NEXT: v_minimum_f32 v13, v13, v29
+; GFX12-NEXT: v_minimum_f32 v14, v14, v30
+; GFX12-NEXT: v_minimum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fminimum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fminimum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fminimum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fminimum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.minimum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fminimum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fminimum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_minimum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fminimum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fminimum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fminimum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fminimum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fminimum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fminimum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_min_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_min_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fminimumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_minimum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fminimumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fminimumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_minimum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.minimum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fminimum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fminimum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fminimum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fminimum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fminimum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.minimum.f32(float, float)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.minimum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 94afa88..9ebf6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4666,21 +4666,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 16
%addr = inttoptr i64 %or to ptr addrspace(1)
@@ -4707,21 +4699,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 9684712..2f9182e 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -1066,13 +1066,13 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3]
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v8, v5
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3]
; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v9, v2, v0
; GISEL-NEXT: v_mov_b32_e32 v7, v6
; GISEL-NEXT: ; %bb.12: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 1e4b633..fc36ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -45,27 +45,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
@@ -105,27 +96,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
@@ -168,29 +150,22 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-GISEL-NEXT: s_endpgm
@@ -234,32 +209,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-GISEL-NEXT: s_endpgm
@@ -303,32 +267,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-GISEL-NEXT: s_endpgm
@@ -374,28 +327,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -436,28 +380,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -501,30 +436,23 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
@@ -569,33 +497,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
@@ -640,33 +557,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/BPF/BTF/binary-format.ll b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
index 3b1be1a..fd09566 100644
--- a/llvm/test/CodeGen/BPF/BTF/binary-format.ll
+++ b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -gdwarf-5 -gembed-source -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %a, !dbg !14
@@ -42,10 +42,7 @@ entry:
; CHECK-EB: 0x00000050 00000008 0000000f 00000018 00000410
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
index 2fb8d25..1672334 100644
--- a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
@@ -24,7 +24,7 @@
@bpf_log = internal global ptr inttoptr (i64 999 to ptr), align 8, !dbg !17
; Function Attrs: nounwind
-define dso_local void @prog1() #0 !dbg !28 {
+define dso_local void @prog1() !dbg !28 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !31, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 0), !dbg !36, !llvm.preserve.access.index !7
@@ -33,10 +33,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local void @prog2() #0 !dbg !38 {
+define dso_local void @prog2() !dbg !38 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !39, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 1, i64 0), !dbg !40, !llvm.preserve.access.index !6
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind
-define dso_local void @prog3() #0 !dbg !42 {
+define dso_local void @prog3() !dbg !42 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !43, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 2, i64 1), !dbg !44, !llvm.preserve.access.index !11
@@ -96,9 +96,6 @@ entry:
; CHECK-NEXT: .long 48
; CHECK-NEXT: .long 7
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!24, !25, !26}
!llvm.ident = !{!27}
diff --git a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
index cc14a32b..1c2b1d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
+++ b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
@@ -10,7 +10,7 @@
@g = dso_local local_unnamed_addr global i32 5, section "maps", align 4
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 {
+define dso_local i32 @test() local_unnamed_addr {
%1 = load i32, ptr @g, align 4, !tbaa !2
ret i32 %1
}
@@ -18,8 +18,6 @@ define dso_local i32 @test() local_unnamed_addr #0 {
; CHECK-NOT: .section .BTF
; CHECK-NOT: .section .BTF.ext
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
index a855016..fa0aa5b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
@@ -10,7 +10,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind readonly
-define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr #0 !dbg !13 {
+define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %skb, metadata !17, metadata !DIExpression()), !dbg !18
%call = tail call i64 @llvm.bpf.load.byte(ptr %skb, i64 10), !dbg !19
@@ -54,13 +54,9 @@ entry:
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readonly
-declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64) #1
+declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
index b7cbb48f..9a31beb 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
@@ -8,9 +8,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -49,11 +49,7 @@ entry:
; CHECK: .ascii "char" # string offset=55
; CHECK: .ascii "global_func" # string offset=60
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
index 299aa1d..c3f93ab 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
@@ -10,7 +10,7 @@
@a = external dso_local local_unnamed_addr global i8, align 1
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !7 {
%1 = load i8, ptr @a, align 1, !dbg !11, !tbaa !12
%2 = sext i8 %1 to i32, !dbg !11
ret i32 %2, !dbg !15
@@ -45,8 +45,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .ascii "/home/yhs/work/tests/llvm/bug/test.c" # string offset=15
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
index d11addd..0ddd634 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -69,10 +69,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .ascii "abc" # string offset=72
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
index 9e82295..fbfc03b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -62,10 +62,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
index 262abb3..0ba4732 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
@@ -10,9 +10,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -61,11 +61,7 @@ entry:
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
index b6e14fc..27793d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
@@ -13,9 +13,9 @@
@ch = external dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
@@ -84,11 +84,7 @@ entry:
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
index 63ab578..ffec16b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
@@ -12,7 +12,7 @@
@global = extern_weak dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -68,8 +68,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
index 3ecda4f..dfe5e5e 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
@@ -13,7 +13,7 @@
@global = external dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -69,8 +69,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
index 57ca18c..7d28987 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
@@ -12,15 +12,15 @@
@ch = extern_weak dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
ret i32 %add, !dbg !25
}
-declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -84,10 +84,6 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
!llvm.ident = !{!15}
diff --git a/llvm/test/CodeGen/BPF/BTF/filename.ll b/llvm/test/CodeGen/BPF/BTF/filename.ll
index ae08aea..0d8742fa 100644
--- a/llvm/test/CodeGen/BPF/BTF/filename.ll
+++ b/llvm/test/CodeGen/BPF/BTF/filename.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone uwtable
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
ret i32 0, !dbg !11
}
@@ -63,8 +63,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1038 # Line 1 Col 14
-attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index b700be9..f8c3de5 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -14,7 +14,7 @@
@b1 = common dso_local local_unnamed_addr global %struct.t1 zeroinitializer, align 8, !dbg !6
; Function Attrs: nounwind readnone
-define dso_local void @f1(i32 %p2) local_unnamed_addr #0 !dbg !19 {
+define dso_local void @f1(i32 %p2) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata i32 %p2, metadata !21, metadata !DIExpression()), !dbg !22
ret void, !dbg !23
@@ -95,10 +95,7 @@ entry:
; CHECK-NEXT: .long 3091 # Line 3 Col 19
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
index 2f562b2..745645d 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32 returned) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %0, !dbg !14
}
@@ -73,10 +73,7 @@ define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-source.ll b/llvm/test/CodeGen/BPF/BTF/func-source.ll
index a485d2c..c305e83 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-source.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-source.ll
@@ -10,7 +10,7 @@
; correct reference to the lines in the string table.
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f() local_unnamed_addr !dbg !7 {
entry:
ret void, !dbg !10
}
@@ -63,8 +63,6 @@ entry:
; CHECK-NEXT: .long 18
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 2570536..388deeb 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -9,7 +9,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !15
ret i32 %a, !dbg !16
@@ -85,12 +85,8 @@ entry:
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 3092 # Line 3 Col 20
-
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
index f9439e6..380642c 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 0, !dbg !14
}
@@ -69,10 +69,7 @@ define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-void.ll b/llvm/test/CodeGen/BPF/BTF/func-void.ll
index bf70b6a..9205700 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f1() local_unnamed_addr !dbg !7 {
ret void, !dbg !10
}
@@ -57,8 +57,6 @@ define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
index 6ef7a30..5c797f7 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
@@ -21,16 +21,16 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5] }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
call void @llvm.dbg.value(metadata ptr @.str, metadata !12, metadata !DIExpression()), !dbg !25
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val), !dbg !26
call void @llvm.dbg.declare(metadata ptr %val, metadata !16, metadata !DIExpression()), !dbg !27
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %val, ptr nonnull align 4 dereferenceable(16) @__const.test.val, i64 16, i1 false), !dbg !27
- tail call void @foo(ptr @.str) #4, !dbg !28
- call void @foo(ptr nonnull %val) #4, !dbg !29
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val) #4, !dbg !30
+ tail call void @foo(ptr @.str), !dbg !28
+ call void @foo(ptr nonnull %val), !dbg !29
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val), !dbg !30
ret i32 0, !dbg !31
}
@@ -39,27 +39,21 @@ entry:
; CHECK-NOT: BTF_KIND_DATASEC
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
index 0e183a5..243cd87 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
@@ -19,14 +19,14 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5], i8 4 }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
- call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val) #4, !dbg !23
+ call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val), !dbg !23
call void @llvm.dbg.declare(metadata ptr %val, metadata !12, metadata !DIExpression()), !dbg !24
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(20) %val, ptr nonnull align 4 dereferenceable(20) @__const.test.val, i64 20, i1 false), !dbg !24
- call void @foo(ptr nonnull %val) #4, !dbg !25
- call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val) #4, !dbg !26
+ call void @foo(ptr nonnull %val), !dbg !25
+ call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val), !dbg !26
ret i32 0, !dbg !27
}
@@ -38,24 +38,18 @@ entry:
; CHECK: .ascii ".rodata" # string offset=42
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var.ll b/llvm/test/CodeGen/BPF/BTF/local-var.ll
index dd79923..fa605d8 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo(i8 signext) local_unnamed_addr !dbg !7 {
%2 = alloca i16, align 2
call void @llvm.dbg.value(metadata i8 %0, metadata !13, metadata !DIExpression()), !dbg !17
call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %2), !dbg !18
@@ -59,20 +59,16 @@ define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
index 8fef9c2..733815d 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
@@ -22,14 +22,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !23
ret i32 0, !dbg !24
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !25 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !25 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = load i32, ptr %arg, align 4, !dbg !35, !tbaa !36
@@ -64,11 +64,7 @@ entry:
; CHECK: .ascii "m2" # string offset=72
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
index 4c8aa1f..727daea 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
@@ -24,14 +24,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
ret i32 0, !dbg !25
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !26 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = load i32, ptr %arg, align 4, !dbg !36, !tbaa !37
@@ -71,11 +71,7 @@ entry:
; CHECK: .ascii "m2" # string offset=81
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-func.ll b/llvm/test/CodeGen/BPF/BTF/static-func.ll
index fc79dbf..6506407 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-func.ll
@@ -9,18 +9,18 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test2() local_unnamed_addr #0 !dbg !12 {
+define dso_local i32 @test2() local_unnamed_addr !dbg !12 {
entry:
%call = tail call fastcc i32 @test1(), !dbg !13
ret i32 %call, !dbg !14
}
; Function Attrs: noinline nounwind
-define internal fastcc i32 @test1() unnamed_addr #1 !dbg !15 {
+define internal fastcc i32 @test1() unnamed_addr !dbg !15 {
entry:
- %call = tail call i32 @foo() #3, !dbg !16
+ %call = tail call i32 @foo(), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
+declare !dbg !4 dso_local i32 @foo() local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -67,11 +67,6 @@ declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
; CHECK-NEXT: .ascii "foo" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
index 1827c97..fedec38 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
@@ -17,7 +17,7 @@
@v4 = internal constant ptr null, align 8, !dbg !19
; Function Attrs: norecurse nounwind
-define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
+define dso_local i64 @foo() local_unnamed_addr !dbg !27 {
%1 = load volatile ptr, ptr @v1, align 8, !dbg !29, !tbaa !30
%2 = load volatile ptr, ptr @v2, align 8, !dbg !34, !tbaa !30
%3 = ptrtoint ptr %1 to i64, !dbg !35
@@ -141,8 +141,6 @@ define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
; CHECK-NEXT: .ascii ".rodata" # string offset=87
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!23, !24, !25}
!llvm.ident = !{!26}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
index cc785b7..deef48a 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
index 2b62882..8f29a83 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".data" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
index a4ae948..e16b467 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
index a9d60ce..1ddd499 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".rodata" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
index ac27b2b..0ff8f2e 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
index 28da203..fe9f508 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
@@ -15,7 +15,7 @@
@sv = internal global { i32, i32, [10 x i8] } { i32 3, i32 4, [10 x i8] c"abcdefghi\00" }, align 4, !dbg !0
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test() local_unnamed_addr !dbg !21 {
%1 = load volatile i32, ptr @sv, align 4, !dbg !24, !tbaa !25
ret i32 %1, !dbg !29
}
@@ -104,8 +104,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
; CHECK-NEXT: .ascii ".data" # string offset=89
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
!llvm.ident = !{!20}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var.ll b/llvm/test/CodeGen/BPF/BTF/static-var.ll
index 461bd27..f7710e3 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".bss" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
index 5b125ea..68d4be0 100644
--- a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
@@ -15,7 +15,7 @@
%struct.anon.0 = type { i64 }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %s1, metadata !25, metadata !DIExpression()), !dbg !26
ret i32 0, !dbg !27
@@ -65,12 +65,8 @@ entry:
; CHECK: .ascii "B1" # string offset=17
; CHECK: .ascii "long int" # string offset=20
-
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
index 4b3b557..14cb8e0 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 2, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,9 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".data" # string offset=62
-
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global.ll b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
index ea0a887..5605e0b 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 0, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,8 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".bss" # string offset=62
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
index 23a4617..eecb993 100644
--- a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -24,13 +24,13 @@ entry:
}
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #2
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !25 {
+define dso_local i32 @bar(ptr %arg) !dbg !25 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -58,10 +58,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 6
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
index 40a2432..0851f25 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
@c = common dso_local global %struct.b zeroinitializer, align 4, !dbg !0
; Function Attrs: nounwind readnone
-define dso_local i32 @f() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @f() local_unnamed_addr !dbg !15 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr elementtype(%struct.b) nonnull @c, i32 1, i32 1), !dbg !18, !llvm.preserve.access.index !6
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !19
@@ -40,13 +40,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index b8b7a0b..51df39b 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -25,7 +25,7 @@ target triple = "bpfeb"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
@@ -177,4 +173,3 @@ attributes #2 = { nounwind readnone speculatable }
!36 = !DILocation(line: 14, column: 10, scope: !13)
!37 = !DILocation(line: 13, column: 67, scope: !13)
!38 = !DILocation(line: 12, column: 3, scope: !13)
-
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 4cf0a13..295c105 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -25,7 +25,7 @@ target triple = "bpfel"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
index cdcd7e6..8f83404 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
@@ -26,7 +26,7 @@ target triple = "bpfeb"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
index dd7f1c7..1a7619a 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
@@ -26,7 +26,7 @@ target triple = "bpfel"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
index 126bd0a..5a98b05 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -25,13 +25,13 @@ entry:
}
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !29 {
+define dso_local i32 @bar(ptr %arg) !dbg !29 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -60,10 +60,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
index 90681d3c..00c3a6d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.value(metadata ptr null, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr elementtype(%struct.s1) null, i32 0, i32 0), !dbg !23, !llvm.preserve.access.index !8
@@ -40,17 +40,13 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg) #1
+declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
index d6bed6c..7e2e8e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
@@ -15,12 +15,12 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr elementtype(%struct.s) %arg, i32 0, i32 2), !dbg !19, !llvm.preserve.access.index !11
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %0, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %1), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -39,22 +39,16 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
index 525f38d..cb6674f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
index 11235b5..2697201 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !16
@@ -71,27 +71,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
index e3382d6..b7541f0 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
index fda7592..0220567 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8, i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr elementtype(%struct.s1) %arg, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !17
@@ -41,17 +41,13 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
index 69872db3..0404deb 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
index 90706e9..240083f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
index 2297040..57dd5b7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
@@ -19,7 +19,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -59,27 +59,23 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
index 503a26c..7caa667 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
@@ -23,7 +23,7 @@ target triple = "bpfeb"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
index 0327f1a..c518573 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
@@ -23,7 +23,7 @@ target triple = "bpfel"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
index 2a92d08..6bf29d4 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -60,24 +60,20 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
index 6e62bb3..441366f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
index 77ea26a..7bc994d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
index 556f69f..ebfecff 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [5 x [5 x i8]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !36, !llvm.preserve.access.index !23
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
index 2741050..d50701c 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
index b71bbf3..312d40f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
@@ -25,7 +25,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !42, !llvm.preserve.access.index !24
@@ -76,24 +76,20 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
index 5caea97..12a21c7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
@@ -24,7 +24,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x i32], [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !29 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !29 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !43, metadata !DIExpression()), !dbg !46
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !47, !llvm.preserve.access.index !33
@@ -66,27 +66,23 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!25, !26, !27}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
index 8b95b1c4..13c7d1d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -39,19 +39,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
index 88658b6..8583322 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
@2 = private unnamed_addr constant [18 x i8] c"VAL10:-2147483648\00", align 1
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test() local_unnamed_addr !dbg !18 {
entry:
%0 = tail call i64 @llvm.bpf.preserve.enum.value(i32 0, ptr @0, i64 0), !dbg !23, !llvm.preserve.access.index !3
%1 = tail call i64 @llvm.bpf.preserve.enum.value(i32 1, ptr @1, i64 1), !dbg !24, !llvm.preserve.access.index !3
@@ -81,10 +81,7 @@ entry:
; CHECK-NEXT: .long 11
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
index 0bdf954..6f316d9 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 0), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 0), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 8
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
index ddd3711..d3aacc72 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 1), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 1), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
index b2f8e48..ad4fc96 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
@@ -20,7 +20,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.declare(metadata ptr undef, metadata !20, metadata !DIExpression()), !dbg !28
call void @llvm.dbg.declare(metadata ptr undef, metadata !19, metadata !DIExpression()), !dbg !29
@@ -65,14 +65,10 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
index ef360929..e0217dd 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%union.u = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %arg, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -38,19 +38,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
index 4c6ce1e..819ee31 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.tt = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !16, metadata !DIExpression()), !dbg !17
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr elementtype(%struct.tt) %arg, i32 0, i32 0), !dbg !18, !llvm.preserve.access.index !12
@@ -26,14 +26,10 @@ entry:
; CHECK-NOT: llvm.tt:0:0$0:0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
index 9998c98..c3f8395 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
@@ -28,7 +28,7 @@ target triple = "bpf"
%struct.data_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr readonly %args) local_unnamed_addr #0 !dbg !12 {
+define dso_local void @test(ptr readonly %args) local_unnamed_addr !dbg !12 {
entry:
%data = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %args, metadata !22, metadata !DIExpression()), !dbg !29
@@ -36,7 +36,7 @@ entry:
%1 = load i32, ptr %0, align 4, !dbg !30, !tbaa !31
%and = and i32 %1, 65536, !dbg !36
call void @llvm.dbg.value(metadata i32 %and, metadata !23, metadata !DIExpression()), !dbg !29
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data) #5, !dbg !37
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data), !dbg !37
call void @llvm.dbg.declare(metadata ptr %data, metadata !24, metadata !DIExpression()), !dbg !38
store i64 0, ptr %data, align 8, !dbg !38
%tobool = icmp eq i32 %and, 0, !dbg !39
@@ -60,8 +60,8 @@ lor.end: ; preds = %lor.end.critedge, %
%5 = phi i32 [ %phitmp, %cond.false ], [ 1, %lor.end.critedge ]
%d2 = getelementptr inbounds %struct.data_t, ptr %data, i64 0, i32 1, !dbg !49
store i32 %5, ptr %d2, align 4, !dbg !50, !tbaa !51
- call void @output(ptr nonnull %data) #5, !dbg !52
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data) #5, !dbg !53
+ call void @output(ptr nonnull %data), !dbg !52
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data), !dbg !53
ret void, !dbg !53
}
@@ -71,28 +71,21 @@ lor.end: ; preds = %lor.end.critedge, %
; CHECK: r[[LOAD]] &= 32768
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg)
-declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr #4
+declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { argmemonly nounwind willreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
index 5da2bbd..1ce453c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
@@ -18,13 +18,13 @@ target triple = "bpf"
%struct.t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !22, metadata !DIExpression()), !dbg !24
call void @llvm.dbg.value(metadata ptr %arg2, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg1, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !12
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg2, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %0, ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %0, ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -46,22 +46,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 024ed04..0fdd704 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -24,19 +24,19 @@ target triple = "bpf"
%struct.net_device = type opaque
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca ptr, align 8
call void @llvm.dbg.value(metadata ptr %0, metadata !26, metadata !DIExpression()), !dbg !28
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2) #4, !dbg !29
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2), !dbg !29
call void @llvm.dbg.value(metadata ptr null, metadata !27, metadata !DIExpression()), !dbg !28
store ptr null, ptr %2, align 8, !dbg !30, !tbaa !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
- %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3) #4, !dbg !36
+ %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3), !dbg !36
%5 = load ptr, ptr %2, align 8, !dbg !37, !tbaa !31
call void @llvm.dbg.value(metadata ptr %5, metadata !27, metadata !DIExpression()), !dbg !28
%6 = icmp ne ptr %5, null, !dbg !38
%7 = zext i1 %6 to i32, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2), !dbg !39
ret i32 %7, !dbg !40
}
@@ -122,22 +122,16 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
index e12221e..65859c86 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !22 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !22 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !26
@@ -30,7 +30,7 @@ entry:
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %2, i32 0, i32 0), !dbg !34, !llvm.preserve.access.index !4
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %3, i32 1, i32 2), !dbg !34, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %4, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %5) #4, !dbg !35
+ %call = tail call i32 @get_value(ptr %5), !dbg !35
ret i32 %call, !dbg !36
}
@@ -60,13 +60,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -75,13 +75,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !19, !20}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
index 1764c9d..f42e7e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !24 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !24 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !28
@@ -31,7 +31,7 @@ entry:
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x %struct.v1]]) %3, i32 1, i32 2), !dbg !36, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %4, i32 1, i32 3), !dbg !36, !llvm.preserve.access.index !18
%6 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %5, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %6) #4, !dbg !37
+ %call = tail call i32 @get_value(ptr %6), !dbg !37
ret i32 %call, !dbg !38
}
@@ -62,13 +62,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -79,13 +79,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!20, !21, !22}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
index bbff3f6..38b1c99 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
@@ -21,12 +21,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !18
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !5
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %1), !dbg !31
ret i32 %call, !dbg !32
}
@@ -60,22 +60,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
index bdc17e6..7730ee3a 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
@@ -24,12 +24,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %1), !dbg !36
ret i32 %call, !dbg !37
}
@@ -47,7 +47,6 @@ entry:
; CHECK: .ascii "0:1" # string offset=45
; CHECK: .ascii "v1" # string offset=91
-
; CHECK: .long 16 # FieldReloc
; CHECK-NEXT: .long 39 # Field reloc section string offset=39
; CHECK-NEXT: .long 2
@@ -60,22 +59,16 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
index dea6e40..e5ef549 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
@@ -22,14 +22,14 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !24
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([40 x i32]) %0, i32 1, i32 4), !dbg !32, !llvm.preserve.access.index !11
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %2, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %3), !dbg !33
ret i32 %call, !dbg !34
}
@@ -60,24 +60,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
index 98fdfde..7aeaed4 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
@@ -24,14 +24,14 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = bitcast ptr %0 to ptr, !dbg !35
%2 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %1, i32 1), !dbg !35, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %2, i64 0, i32 0, !dbg !35
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %b), !dbg !36
ret i32 %call, !dbg !37
}
@@ -61,21 +61,15 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
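A detail worth noting in the union tests: unlike the struct and array variants, llvm.preserve.union.access.index takes no elementtype operand and no GEP index, only the base pointer and the member index, presumably because every union member lives at byte offset zero and only the member index matters for the relocation. A hypothetical minimal module showing the shape (placeholder names; the real tests attach debug-info type metadata instead of an empty node):

  target triple = "bpf"

  define i32 @sketch(ptr %obj) {
  entry:
    ; record the access to union member 1 for a CO-RE relocation
    %m = call ptr @llvm.preserve.union.access.index.p0.p0(ptr %obj, i32 1), !llvm.preserve.access.index !0
    %v = load i32, ptr %m, align 4
    ret i32 %v
  }

  declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)

  !0 = !{}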
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
index 7b63699..12c3936 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !24
@@ -31,7 +31,7 @@ entry:
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %2, i32 1), !dbg !32, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %3, i64 0, i32 0, !dbg !32
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %b), !dbg !33
ret i32 %call, !dbg !34
}
@@ -62,24 +62,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
index 499e368..ee1f0e2 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !20, metadata !DIExpression()), !dbg !21
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !22, !llvm.preserve.access.index !15
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
index 2aadbdf..3d66435 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readnone
-define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !13
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
index 34ea050..cf75909 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
@@ -40,11 +40,11 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
%ull = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !37
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull) #5, !dbg !38
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull), !dbg !38
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !39, !llvm.preserve.access.index !25
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !40
call void @llvm.dbg.value(metadata i32 %1, metadata !34, metadata !DIExpression()), !dbg !37
@@ -52,7 +52,7 @@ entry:
call void @llvm.dbg.value(metadata i32 %2, metadata !35, metadata !DIExpression()), !dbg !37
%idx.ext = zext i32 %1 to i64, !dbg !43
%add.ptr = getelementptr i8, ptr %arg, i64 %idx.ext, !dbg !43
- call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr) #5, !dbg !44
+ call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr), !dbg !44
%3 = call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 4), !dbg !45
call void @llvm.dbg.value(metadata i32 %3, metadata !36, metadata !DIExpression()), !dbg !37
%4 = load i64, ptr %ull, align 8, !dbg !46, !tbaa !47
@@ -68,7 +68,7 @@ entry:
%shr3 = lshr i64 %shl, %sh_prom1, !dbg !53
%retval.0.in = select i1 %tobool, i64 %shr3, i64 %shr, !dbg !53
%retval.0 = trunc i64 %retval.0.in to i32, !dbg !37
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull) #5, !dbg !54
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull), !dbg !54
ret i32 %retval.0, !dbg !54
}
@@ -114,28 +114,21 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #2
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
-declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr #3
+declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #4
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone speculatable willreturn }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
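The fieldinfo tests layer one more intrinsic on top of the access chain: llvm.bpf.preserve.field.info takes the pointer produced by llvm.preserve.struct.access.index plus an i64 kind selector (0 and 4 in the hunks above) and yields a field property as a constant that BPF codegen rewrites through a CO-RE relocation. A stripped-down sketch of that pairing (placeholder metadata, and the kind value is assumed to select the field's byte offset as in the first call above):

  target triple = "bpf"

  %struct.s = type { i32, i16 }

  define i32 @sketch(ptr %arg) {
  entry:
    ; record the access to member index 2 of %struct.s
    %p = call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !llvm.preserve.access.index !0
    ; materialize a property of that field (kind 0) as a relocatable i32
    %info = call i32 @llvm.bpf.preserve.field.info.p0(ptr %p, i64 0)
    ret i32 %info
  }

  declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
  declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)

  !0 = !{}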
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
index 01c5e69..d5b2d052 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
@@ -42,7 +42,7 @@ target triple = "bpfeb"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
index d458d41..5076e79 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
@@ -42,7 +42,7 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
index 7657b78..2f42118 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
@@ -19,10 +19,10 @@ target triple = "bpf"
@g = dso_local global %struct.v3 zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) nonnull @g, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !7
- %call = tail call i32 @get_value(ptr %0) #3, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
index bed14ab..f43df76 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
@@ -19,12 +19,12 @@ target triple = "bpf"
@g = dso_local global [4 x [5 x %struct.v3]] zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test() local_unnamed_addr !dbg !23 {
entry:
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([4 x [5 x %struct.v3]]) nonnull @g, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !6
%1 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([5 x %struct.v3]) %0, i32 1, i32 2), !dbg !26, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %1, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %2) #3, !dbg !27
+ %call = tail call i32 @get_value(ptr %2), !dbg !27
ret i32 %call, !dbg !28
}
@@ -47,21 +47,15 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
index 49b89e2..5bc2bf9 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
@@ -19,11 +19,11 @@ target triple = "bpf"
@g = dso_local local_unnamed_addr global ptr null, section "stats", align 8, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = load ptr, ptr @g, align 8, !dbg !20, !tbaa !21
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !20, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %1) #3, !dbg !25
+ %call = tail call i32 @get_value(ptr %1), !dbg !25
ret i32 %call, !dbg !26
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
index 4ff170cf..983383c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
@@ -13,11 +13,11 @@
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !10 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !10 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !14, metadata !DIExpression()), !dbg !15
%0 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype(i32) %arg, i32 0, i32 4), !dbg !16, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !17
+ %call = tail call i32 @get_value(ptr %0), !dbg !17
ret i32 %call, !dbg !18
}
@@ -26,19 +26,13 @@ entry:
; CHECK: .section .BTF.ext,"",@progbits
; CHECK-NOT: .long 16 # FieldReloc
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
index e5f86c2..c67d57f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
@@ -29,7 +29,7 @@ target triple = "bpf"
%struct.t1 = type { i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local void @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr elementtype(%struct.r1) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11
@@ -38,7 +38,7 @@ entry:
call void @llvm.dbg.value(metadata ptr %1, metadata !25, metadata !DIExpression()), !dbg !29
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr elementtype(%struct.t1) %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17
call void @llvm.dbg.value(metadata ptr %2, metadata !27, metadata !DIExpression()), !dbg !29
- tail call void @test1(ptr %0, ptr %1, ptr %2) #4, !dbg !36
+ tail call void @test1(ptr %0, ptr %1, ptr %2), !dbg !36
ret void, !dbg !37
}
@@ -67,24 +67,18 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32)
-declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr #2
+declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
index 8ca3ef5..7ffb4de 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
@@ -17,14 +17,14 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x i32]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !21 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !25, metadata !DIExpression()), !dbg !26
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !27, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !27, !llvm.preserve.access.index !6
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %1, i32 1, i32 2), !dbg !27, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %2, i32 1, i32 3), !dbg !27, !llvm.preserve.access.index !15
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !28
+ %call = tail call i32 @get_value(ptr %3), !dbg !28
ret i32 %call, !dbg !29
}
@@ -46,27 +46,21 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
index b2ba5a8..55bb7c58 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x [4 x i32]]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !23 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !29, !llvm.preserve.access.index !4
@@ -25,7 +25,7 @@ entry:
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x [4 x i32]]]) %1, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %2, i32 1, i32 3), !dbg !29, !llvm.preserve.access.index !15
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %3, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %4) #4, !dbg !30
+ %call = tail call i32 @get_value(ptr %4), !dbg !30
ret i32 %call, !dbg !31
}
@@ -47,29 +47,23 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
index e00bbb8..a5b4604 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
@@ -28,16 +28,16 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !28, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !31
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %3, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !23
- %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4) #4, !dbg !33
+ %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4), !dbg !33
%6 = load i32, ptr %2, align 4, !dbg !34, !tbaa !35
call void @llvm.dbg.value(metadata i32 %6, metadata !29, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !39
ret i32 %6, !dbg !40
}
@@ -130,25 +130,19 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
index b4d1844..ffd77ed 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
@@ -16,11 +16,11 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %0), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,19 +42,13 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
index 87b88bc..cb0aff3 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
@@ -16,12 +16,12 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %1), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,22 +42,16 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
index 8ebbfea..2081b3f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
@@ -27,17 +27,17 @@ target triple = "bpf"
%struct.anon = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr elementtype([10 x %struct.anon]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr elementtype(%struct.anon) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -140,28 +140,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
index 64ec250..4e51366 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
@@ -28,17 +28,17 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr elementtype([10 x %struct.net_device]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -143,28 +143,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
index ed462e1..eb0620d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
@@ -20,12 +20,12 @@ target triple = "bpf"
%struct.__s = type { [7 x i32] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !24, metadata !DIExpression()), !dbg !25
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 0, i32 0), !dbg !26, !llvm.preserve.access.index !13
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([7 x i32]) %0, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !19
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -48,22 +48,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
index 6b806ae..c4edda1 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%struct.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr elementtype(%struct.__t) %arg, i32 0, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
index c2b5a11..f8cf253 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%struct.__s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 1, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
index a63b7e7..0fe7c1f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%union.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr %arg, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
index 4b3d178..aa8705d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%union.__s = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr %arg, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
index e757327..5195d17 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
@@ -24,13 +24,13 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr elementtype([7 x %union.u]) %arg, i32 0, i32 1), !dbg !30, !llvm.preserve.access.index !14
%1 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %0, i32 1), !dbg !30, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %1, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !20
- %call = tail call i32 @get_value(ptr %2) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %2), !dbg !31
ret i32 %call, !dbg !32
}
@@ -53,25 +53,19 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR:[0-9]+]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
index 824eba9a..e156999 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
@@ -31,17 +31,17 @@ target triple = "bpf"
%union.anon = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !32, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !35
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !35
%3 = tail call ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr %0, i32 1), !dbg !36, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr elementtype(%struct.anon) %3, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr %4, i32 0), !dbg !36, !llvm.preserve.access.index !27
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !37
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !37
%7 = load i32, ptr %2, align 4, !dbg !38, !tbaa !39
call void @llvm.dbg.value(metadata i32 %7, metadata !33, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !43
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !43
ret i32 %7, !dbg !44
}
@@ -145,28 +145,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/store-addr.ll b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
index 33bbd01..2c8a0c4 100644
--- a/llvm/test/CodeGen/BPF/CORE/store-addr.ll
+++ b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
@@ -22,17 +22,17 @@ target triple = "bpf"
%struct.t = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
%param = alloca [1 x i64], align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !27
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param) #5, !dbg !28
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param), !dbg !28
call void @llvm.dbg.declare(metadata ptr %param, metadata !23, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !18
%1 = ptrtoint ptr %0 to i64, !dbg !31
store i64 %1, ptr %param, align 8, !dbg !33, !tbaa !34
- %call = call i32 @foo(ptr nonnull %param) #5, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param) #5, !dbg !39
+ %call = call i32 @foo(ptr nonnull %param), !dbg !38
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param), !dbg !39
ret i32 %call, !dbg !40
}
@@ -41,28 +41,21 @@ entry:
; CHECK: *(u64 *)(r10 - 8) = r1
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32)
-declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr #4
+declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
index 8a4b37d..09ca422 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
@@ -20,12 +20,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -62,25 +62,20 @@ if.end: ; preds = %lor.lhs.false
br label %cleanup
cleanup: ; preds = %if.end, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
index ad157fe..bbda062 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
@@ -18,12 +18,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -65,25 +65,20 @@ if.end3: ; preds = %if.end
br label %cleanup
cleanup: ; preds = %if.end3, %if.then2, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
index d118fa0..d34d652 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i64, align 8
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %ret)
%call = call i64 @foo()
store i64 %call, ptr %ret, align 8, !tbaa !6
%0 = load i64, ptr %ret, align 8, !tbaa !6
@@ -36,7 +36,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %ret)
ret ptr %3
}
; CHECK-COMMON: [[REG6:r[0-9]+]] = r1
@@ -57,17 +57,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i64 @foo(...) #2
+declare dso_local i64 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
index 218fa5d..5f3fa94 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i32, align 4
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !6
%0 = load i32, ptr %ret, align 4, !tbaa !6
@@ -37,7 +37,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
ret ptr %3
}
@@ -66,17 +66,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/callx.ll b/llvm/test/CodeGen/BPF/callx.ll
index d83e0f6..e027c1f 100644
--- a/llvm/test/CodeGen/BPF/callx.ll
+++ b/llvm/test/CodeGen/BPF/callx.ll
@@ -3,16 +3,13 @@
; int test(int (*f)(void)) { return f(); }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr {
entry:
- %call = tail call i32 %f() #1
+ %call = tail call i32 %f()
; CHECK: callx r{{[0-9]+}}
ret i32 %call
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/dwarfdump.ll b/llvm/test/CodeGen/BPF/dwarfdump.ll
index a3a6b52..d50c025 100644
--- a/llvm/test/CodeGen/BPF/dwarfdump.ll
+++ b/llvm/test/CodeGen/BPF/dwarfdump.ll
@@ -10,7 +10,7 @@ target triple = "bpf"
@testprog.myvar_c = internal unnamed_addr global i32 0, align 4, !dbg !0
; Function Attrs: nounwind
-define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
+define i32 @testprog(i32, i32) local_unnamed_addr !dbg !2 {
tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !11, metadata !16), !dbg !17
tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !12, metadata !16), !dbg !18
%3 = load i32, ptr @testprog.myvar_c, align 4, !dbg !19, !tbaa !20
@@ -21,10 +21,7 @@ define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/CodeGen/BPF/i128.ll b/llvm/test/CodeGen/BPF/i128.ll
index a966e3e..3c94e0c 100644
--- a/llvm/test/CodeGen/BPF/i128.ll
+++ b/llvm/test/CodeGen/BPF/i128.ll
@@ -19,14 +19,14 @@
%struct.ipv6_key_t = type { i32, i128, i16 }
; Function Attrs: nounwind
-define dso_local i32 @test(i32 %pid) local_unnamed_addr #0 {
+define dso_local i32 @test(i32 %pid) local_unnamed_addr {
entry:
%ipv6_key = alloca %struct.ipv6_key_t, align 16
- call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key)
call void @llvm.memset.p0.i64(ptr nonnull align 16 dereferenceable(48) %ipv6_key, i8 0, i64 48, i1 false)
store i32 %pid, ptr %ipv6_key, align 16, !tbaa !2
- call void @test1(ptr nonnull %ipv6_key) #4
- call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @test1(ptr nonnull %ipv6_key)
+ call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key)
ret i32 0
}
@@ -35,21 +35,15 @@ entry:
; CHECK: *(u32 *)(r10 - 48) = r{{[0-9]+}}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: argmemonly nounwind willreturn writeonly
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare dso_local void @test1(ptr) local_unnamed_addr #3
+declare dso_local void @test1(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { argmemonly nounwind willreturn writeonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_trunc_free.ll b/llvm/test/CodeGen/BPF/is_trunc_free.ll
index fe00731..6bb8568 100644
--- a/llvm/test/CodeGen/BPF/is_trunc_free.ll
+++ b/llvm/test/CodeGen/BPF/is_trunc_free.ll
@@ -29,7 +29,7 @@
%struct.env_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %skb) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr %skb) local_unnamed_addr {
entry:
%data_end1 = getelementptr inbounds %struct.env_t, ptr %skb, i64 0, i32 1
%0 = load i32, ptr %data_end1, align 4, !tbaa !2
@@ -49,7 +49,7 @@ if.end10: ; preds = %entry
%sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
%4 = trunc i64 %sub.ptr.lhs.cast to i32
%conv13 = sub i32 %4, %2
- %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13) #2
+ %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13)
br label %cleanup
cleanup: ; preds = %entry, %if.end10
@@ -59,11 +59,7 @@ cleanup: ; preds = %entry, %if.end10
; CHECK: w{{[0-9]+}} = *(u32 *)(r{{[0-9]+}} + 0)
-declare dso_local i32 @work(ptr, i32) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare dso_local i32 @work(ptr, i32) local_unnamed_addr
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_zext_free.ll b/llvm/test/CodeGen/BPF/is_zext_free.ll
index 4b81a90..3b794a9 100644
--- a/llvm/test/CodeGen/BPF/is_zext_free.ll
+++ b/llvm/test/CodeGen/BPF/is_zext_free.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -emit-llvm -S test.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr #0 {
+define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr {
entry:
%and = and i64 %y, %x
%conv = trunc i64 %and to i32
@@ -17,8 +17,6 @@ entry:
; CHECK: r[[REG1:[0-9]+]] = r{{[0-9]+}}
; CHECK: w[[REG1]] &= w{{[0-9]+}}
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
index fb1043c..8158a1b 100644
--- a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
+++ b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
@@ -14,7 +14,7 @@
; clang -target bpf -S -gdwarf-5 -gembed-source -emit-llvm -g -O2 bug.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func1(i32 %a) local_unnamed_addr #0 section "s1" !dbg !7 {
+define dso_local i32 @func1(i32 %a) local_unnamed_addr section "s1" !dbg !7 {
entry:
; CHECK: <func1>:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
@@ -24,7 +24,7 @@ entry:
}
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func2(i32 %a) local_unnamed_addr #0 section "s2" !dbg !16 {
+define dso_local i32 @func2(i32 %a) local_unnamed_addr section "s2" !dbg !16 {
entry:
; CHECK: <func2>:
call void @llvm.dbg.value(metadata i32 %a, metadata !18, metadata !DIExpression()), !dbg !19
@@ -35,10 +35,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/optnone-1.ll b/llvm/test/CodeGen/BPF/optnone-1.ll
index 68046bf..f45c85b 100644
--- a/llvm/test/CodeGen/BPF/optnone-1.ll
+++ b/llvm/test/CodeGen/BPF/optnone-1.ll
@@ -5,7 +5,7 @@
; clang -target bpf -g -S -emit-llvm test.c
; Function Attrs: noinline nounwind optnone
-define dso_local i32 @test(i32 %a, i32 %b) #0 !dbg !7 {
+define dso_local i32 @test(i32 %a, i32 %b) !dbg !7 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -22,10 +22,7 @@ entry:
; CHECK-LABEL: test
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable}
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf-2.ll b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
index 7398257..430abc7 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf-2.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
@@ -14,7 +14,7 @@
@s = internal global i32 0, align 4, !dbg !6
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test() local_unnamed_addr !dbg !14 {
%1 = load i32, ptr @g, align 4, !dbg !17, !tbaa !18
%2 = load volatile i32, ptr @s, align 4, !dbg !22, !tbaa !18
%3 = add nsw i32 %2, %1, !dbg !23
@@ -27,8 +27,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
; CHECK-RELOC: R_BPF_64_NODYLD32 g
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!10, !11, !12}
!llvm.ident = !{!13}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf.ll b/llvm/test/CodeGen/BPF/reloc-btf.ll
index b9f6e3a..875bfa1 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=bpfel -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
ret i32 0, !dbg !11
}
@@ -13,8 +13,6 @@ entry:
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
; CHECK-RELOC: R_BPF_64_NODYLD32 .text
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/simplifycfg.ll b/llvm/test/CodeGen/BPF/simplifycfg.ll
index fcd2321..d53b51a 100644
--- a/llvm/test/CodeGen/BPF/simplifycfg.ll
+++ b/llvm/test/CodeGen/BPF/simplifycfg.ll
@@ -38,15 +38,15 @@ target triple = "bpf"
%struct.FrameData = type { ptr }
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%frame_ptr = alloca ptr, align 8
%frame = alloca %struct.FrameData, align 8
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr) #3
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame)
call void @get_frame_ptr(ptr %frame_ptr)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond
@@ -61,7 +61,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -93,25 +93,20 @@ for.end: ; preds = %for.cond.cleanup
%5 = load ptr, ptr %frame_ptr, align 8, !tbaa !6
%cmp2 = icmp eq ptr %5, null
%conv = zext i1 %cmp2 to i32
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame) #3
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr)
ret i32 %conv
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local void @get_frame_ptr(ptr) #2
+declare dso_local void @get_frame_ptr(ptr)
-declare dso_local i32 @get_data(ptr, ptr) #2
+declare dso_local i32 @get_data(ptr, ptr)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/warn-stack.ll b/llvm/test/CodeGen/BPF/warn-stack.ll
index 58a6e4c..5e62a91 100644
--- a/llvm/test/CodeGen/BPF/warn-stack.ll
+++ b/llvm/test/CodeGen/BPF/warn-stack.ll
@@ -1,43 +1,37 @@
; RUN: not llc -mtriple=bpfel < %s 2>&1 >/dev/null | FileCheck %s
;; CHECK-NOT: nowarn
-define void @nowarn() local_unnamed_addr #0 !dbg !6 {
+define void @nowarn() local_unnamed_addr !dbg !6 {
%1 = alloca [504 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1) #4, !dbg !15
+ call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1), !dbg !15
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !10, metadata !16), !dbg !17
- call void @doit(ptr nonnull %1) #4, !dbg !18
- call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1) #4, !dbg !19
+ call void @doit(ptr nonnull %1), !dbg !18
+ call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1), !dbg !19
ret void, !dbg !19
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @doit(ptr) local_unnamed_addr #3
+declare void @doit(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; CHECK: error: warn_stack.c
; CHECK: BPF stack limit
-define void @warn() local_unnamed_addr #0 !dbg !20 {
+define void @warn() local_unnamed_addr !dbg !20 {
%1 = alloca [512 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1), !dbg !26
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !22, metadata !16), !dbg !27
- call void @doit(ptr nonnull %1) #4, !dbg !28
- call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1) #4, !dbg !29
+ call void @doit(ptr nonnull %1), !dbg !28
+ call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1), !dbg !29
ret void, !dbg !29
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!5}
diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll
index a3ec323..8d232ffb 100644
--- a/llvm/test/CodeGen/BPF/xadd.ll
+++ b/llvm/test/CodeGen/BPF/xadd.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15
%0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16
@@ -28,10 +28,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
new file mode 100644
index 0000000..4385da3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
@@ -0,0 +1,64 @@
+; Make sure we do not assert for the cases we do not handle.
+; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s
+
+; Mainly make sure we do not core dump.
+; CHECK-NOT: scatter
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite)
+define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 {
+entry:
+ %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0
+ %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0
+ %cmp.not15 = icmp ugt i32 8, %T
+ br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp3.not8 = icmp ugt i32 8, %W
+ %conv.ripple.LS.instance = trunc i32 %W to i8
+ %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0
+ %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer
+ br label %for.cond1.preheader
+
+for.cond.loopexit: ; preds = %for.body5, %for.cond1.preheader
+ %add = add i32 %add17, 8
+ %cmp.not = icmp ugt i32 %add, %T
+ br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit
+ %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ]
+ %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ]
+ br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph
+
+for.body5.lr.ph: ; preds = %for.cond1.preheader
+ %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016
+ br label %for.body5
+
+for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
+ ret void
+
+for.body5: ; preds = %for.body5.lr.ph, %for.body5
+ %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ]
+ %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ]
+ %gep = getelementptr i8, ptr %gep14, i32 %w.09
+ %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
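+ ; Note: the per-lane offsets above form overlapping 8-element windows, which is the
+ ; unhandled pattern; the scatter should be scalarized (hence CHECK-NOT) instead of asserting.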
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true))
+ %add2 = add i32 %add210, 8
+ %cmp3.not = icmp ugt i32 %add2, %W
+ br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000..83fd63e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
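+; (Assumption: HVX vgather supports only halfword and word elements, so the 8-bit case
+; is widened and the results packed back down, hence vand/vpacke rather than a byte vgather.)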
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %0 = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+ %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> splat (i1 true), <32 x i32> poison)
+ store <32 x i32> %1, ptr %destination, align 4
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %0 = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
+ store <64 x i16> %1, ptr %destination, align 2
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %0 = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> splat (i1 true), <128 x i8> poison)
+ store <128 x i8> %1, ptr %destination, align 1
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr>, i32 immarg, <128 x i1>, <128 x i8>) #1
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000..1bd79d7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
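+;
+; vgather deposits its result in the temporary vector vtmp, which is then
+; committed to memory with a vmem(...) = vtmp.new store; the two CHECK lines
+; above match that gather/commit pair.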
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+ %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+ %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %div16 = lshr i32 %nS, 6
+ %cmp6.not = icmp ult i32 %nS, 64
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+ %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+ %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+ %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+ %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+ %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+ %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+ %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+ %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+ %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+ %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+ %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+ %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+ %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+ %cmp = icmp ult i32 %add9, %div16
+ br i1 %cmp, label %for.body, label %for.end
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+ %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+ %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+ %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+ %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+ %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+ %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+ %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000..85d2999
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
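+;
+; HVX vgather/vscatter operate on halfword and word elements only, so the
+; byte scatter is assumed here to be lowered as two predicated halfword
+; vscatter operations, which is why Ripple_scatter_8 expects two conditional
+; vscatter instructions.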
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+ %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+ %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+ %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000..461fd79
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This input produced a masked gather that we do not yet handle.
+; REQUIRES: asserts
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Original C++, compiled with:
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -mv68
+;typedef struct poptContext_s * poptContext;
+;typedef struct { unsigned int bits[1]; } pbm_set;
+;struct poptContext_s { pbm_set * arg_strip; };
+;
+;int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+; int numargs = argc;
+; for (int i = 1; i < argc; i++) {
+; if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+; numargs--;
+; }
+; return numargs;
+;}
+
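+; A minimal sketch (an assumption for illustration, not taken from this
+; test's output) of the kind of call the vectorizer would emit if it did
+; vectorize the bit-array load with a gather; the CHECK-NOT below asserts
+; that no such call appears:
+;   %g = call <32 x i32> @llvm.masked.gather.v32i32.v32p0(
+;            <32 x ptr> %addrs, i32 4, <32 x i1> %mask, <32 x i32> poison)
+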
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr #0 {
+entry:
+ %cmp8 = icmp sgt i32 %argc, 1
+ br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load ptr, ptr %con, align 4
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %numargs.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+ %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+ %div7 = lshr i32 %i.010, 5
+ %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+ %1 = load i32, ptr %arrayidx, align 4
+ %rem = and i32 %i.010, 31
+ %shl = shl nuw i32 1, %rem
+ %and = and i32 %1, %shl
+ %cmp1.not = icmp ne i32 %and, 0
+ %dec = sext i1 %cmp1.not to i32
+ %spec.select = add nsw i32 %numargs.09, %dec
+ %inc = add nuw nsw i32 %i.010, 1
+ %exitcond.not = icmp eq i32 %inc, %argc
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000..5700380
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+@VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+@region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+ %vgather.addr = alloca ptr, align 4
+ %offsets.addr = alloca <32 x i32>, align 128
+ store ptr %vgather, ptr %vgather.addr, align 4
+ store <32 x i32> %offsets, ptr %offsets.addr, align 128
+ %0 = load ptr, ptr %vgather.addr, align 4
+ %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+ %2 = load i32, ptr @region_len, align 4
+ %3 = load <32 x i32>, ptr %offsets.addr, align 128
+ call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+ ret void
+}
+
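+; For reference (a summary, not normative): @llvm.hexagon.V6.vgathermh.128B
+; takes a destination pointer, a gather base address, a region length, and a
+; vector of halfword offsets; it gathers 64 halfwords and stores the result
+; vector through the destination pointer, which the vtmp.h = vgather(...).h
+; and vmem(...) = vtmp.new CHECK lines above match.
+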
+declare void @llvm.hexagon.V6.vgathermh.128B(ptr, i32, i32, <32 x i32>)
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
new file mode 100644
index 0000000..4eee880a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -0,0 +1,709 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @ctlz_i32(i32 %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bnez a1, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: addi a0, a0, 32
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: clz a0, a1
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define i32 @cttz_i32(i32 %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB2_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: or a2, a0, a1
+; CHECK-NEXT: beqz a2, .LBB3_3
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: bnez a0, .LBB3_4
+; CHECK-NEXT: # %bb.2: # %cond.false
+; CHECK-NEXT: addi a0, a1, -1
+; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: j .LBB3_5
+; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: .LBB3_5: # %cond.false
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define i32 @sextb_i32(i32 %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define i32 @sexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define i32 @min_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit-manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB9_3
+; CHECK-NEXT: j .LBB9_4
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB9_4
+; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @max_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit-manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB11_3
+; CHECK-NEXT: j .LBB11_4
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB11_4
+; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @minu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit-manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB13_3
+; CHECK-NEXT: j .LBB13_4
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB13_4
+; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Because we do not match i64 code patterns directly on RV32, some i64
+; patterns do not yet have matching bit-manipulation instructions on RV32.
+; This test is kept here in case future expansions of the Bitmanip
+; extensions introduce instructions suitable for this pattern.
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB15_3
+; CHECK-NEXT: j .LBB15_4
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB15_4
+; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bgez a1, .LBB17_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: snez a2, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: sub a1, a1, a2
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+define i32 @zexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: zexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: ret
+ %and = and i32 %a, 65535
+ ret i32 %and
+}
+
+define i64 @zexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: zexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %and = and i64 %a, 65535
+ ret i64 %and
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define i32 @bswap_i32(i32 %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a2, a1
+; CHECK-NEXT: rev8 a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i32 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 31
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+
+define i32 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 30
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB27_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB27_3
+; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB27_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a3, 4(a1)
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB28_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu t1, a5, t0
+; CHECK-NEXT: j .LBB28_3
+; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: sltu t1, a4, a7
+; CHECK-NEXT: .LBB28_3:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB28_5
+; CHECK-NEXT: # %bb.4:
+; CHECK-NEXT: sltu t2, a3, a6
+; CHECK-NEXT: j .LBB28_6
+; CHECK-NEXT: .LBB28_5:
+; CHECK-NEXT: sltu t2, a1, a2
+; CHECK-NEXT: .LBB28_6:
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
+; CHECK-NEXT: or t3, t4, t3
+; CHECK-NEXT: beqz t3, .LBB28_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: mv t2, t1
+; CHECK-NEXT: .LBB28_8:
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
+; CHECK-NEXT: sltu a2, a1, t0
+; CHECK-NEXT: and a7, t3, a7
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: beq a3, t1, .LBB28_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: sltu a6, a3, t1
+; CHECK-NEXT: .LBB28_10:
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
+; CHECK-NEXT: sub a1, a1, t0
+; CHECK-NEXT: sltu a7, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
+; CHECK-NEXT: sub a3, a3, a2
+; CHECK-NEXT: sub a2, a5, a7
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: sw a3, 4(a0)
+; CHECK-NEXT: sw a4, 8(a0)
+; CHECK-NEXT: sw a2, 12(a0)
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sll a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: minu a2, a0, a2
+; CHECK-NEXT: bltu a0, a1, .LBB30_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB30_2:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 %x, i32 %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 16
+; CHECK-NEXT: srli a1, a1, 16
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: beq a1, a2, .LBB35_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltiu a2, a1, 2
+; CHECK-NEXT: xori a2, a2, 1
+; CHECK-NEXT: j .LBB35_3
+; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: lui a2, 172127
+; CHECK-NEXT: addi a2, a2, 511
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: andi a3, a2, -2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: lui a3, 876449
+; CHECK-NEXT: addi a3, a3, -512
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
new file mode 100644
index 0000000..cb07f94
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define signext i32 @ctlz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @log2_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 31
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %2 = sub i32 31, %1
+ ret i32 %2
+}
+
+define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_ceil_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = sub i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ %3 = sub i32 32, %2
+ ret i32 %3
+}
+
+define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findLastSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a1, a0
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: xori a1, a1, 31
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %2 = xor i32 31, %1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 -1, i32 %2
+ ret i32 %4
+}
+
+define i32 @ctlz_lshr_i32(i32 signext %a) {
+; CHECK-LABEL: ctlz_lshr_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srliw a0, a0, 1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = lshr i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ ret i32 %2
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define signext i32 @cttz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB6_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_zero_undef_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %1
+}
+
+define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findFirstSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = icmp eq i32 %a, 0
+ %3 = select i1 %2, i32 -1, i32 %1
+ ret i32 %3
+}
+
+define signext i32 @ffs_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ffs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 33
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = add i32 %1, 1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 0, i32 %2
+ ret i32 %4
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB10_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define signext i32 @sextb_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define signext i32 @sexth_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+define signext i32 @abs_i32_sext(i32 signext %x) {
+; CHECK-LABEL: abs_i32_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define signext i32 @bswap_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srai a0, a0, 32
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+; Similar to bswap_i32 but the result is not sign extended.
+define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
+; CHECK-LABEL: bswap_i32_nosext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ store i32 %1, ptr %x
+ ret void
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i64 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 63
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define i64 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 62
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define signext i32 @func0000000000000001(i32 signext %0, i8 signext %1) #0 {
+; CHECK-LABEL: func0000000000000001:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slli a1, a1, 59
+; CHECK-NEXT: srai a1, a1, 63
+; CHECK-NEXT: addw a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %2 = shl i8 %1, 3
+ %3 = ashr i8 %2, 7
+ %4 = sext i8 %3 to i32
+ %5 = add nsw i32 %4, %0
+ ret i32 %5
+}
+
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB36_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB36_3
+; CHECK-NEXT: .LBB36_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB36_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: sltu a2, a3, a2
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: bltu a3, a2, .LBB38_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB38_2:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 signext %x, i32 signext %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: subw a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: srli a1, a1, 48
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1046192
+; CHECK-NEXT: addi a1, a1, -761
+; CHECK-NEXT: slli a1, a1, 9
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: addw a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: addw a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
new file mode 100644
index 0000000..5c0c6c1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_v_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_v_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_v_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_v_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_v_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_v_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
new file mode 100644
index 0000000..2d97f73
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x half> %a
+}
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_mask_v_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_mask_v_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_mask_v_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_mask_v_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_mask_v_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_mask_v_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x half> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
new file mode 100644
index 0000000..46dce14
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_mask_v_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_mask_v_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_mask_v_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_mask_v_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_mask_v_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
new file mode 100644
index 0000000..d3d10d2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+ ret <vscale x 32 x half> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 32 x half> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
new file mode 100644
index 0000000..3de0e93
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_nxv1f64(<vscale x 1 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64(
+ <vscale x 1 x double> poison,
+ <vscale x 1 x double> %0,
+ iXLen %1)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_nxv2f64(<vscale x 2 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64(
+ <vscale x 2 x double> poison,
+ <vscale x 2 x double> %0,
+ iXLen %1)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_nxv4f64(<vscale x 4 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64(
+ <vscale x 4 x double> poison,
+ <vscale x 4 x double> %0,
+ iXLen %1)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_nxv8f64(<vscale x 8 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64(
+ <vscale x 8 x double> poison,
+ <vscale x 8 x double> %0,
+ iXLen %1)
+ ret <vscale x 8 x double> %f
+}
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64(
+ <vscale x 1 x double> %0,
+ <vscale x 1 x double> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64(
+ <vscale x 2 x double> %0,
+ <vscale x 2 x double> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64(
+ <vscale x 4 x double> %0,
+ <vscale x 4 x double> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64(
+ <vscale x 8 x double> %0,
+ <vscale x 8 x double> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x double> %f
+}
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
new file mode 100644
index 0000000..cb44a76
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o 2>&1 | FileCheck %s
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s --check-prefix=NOFDES
+
+## Tests that .cfi_escape sequences that are unrepresentable in SFrame emit
+## a warning and do not produce FDEs.
+
+ .align 1024
+cfi_escape_sp:
+ .cfi_startproc
+ .long 0
+## Setting SP via other registers makes the frame unrepresentable in SFrame
+## DW_CFA_expression,reg 0x7,length 2,DW_OP_breg6,SLEB(-8)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_expression with SP reg 7
+ .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
+ .long 0
+.cfi_endproc
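+
+## (SLEB128 aside: 0x78 above has bit 6 set and no continuation bit, so it
+## decodes to 0x78 - 0x80 = -8.)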
+
+cfi_escape_args_sp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is not OK if the CFA is SP
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size with non frame-pointer CFA
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_val_offset:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## DW_CFA_val_offset,rbp,ULEB scaled offset(16)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with FP reg 6
+ .cfi_escape 0x14,0x6,0x2
+ .long 0
+ .cfi_endproc
+
+# NOFDES: Num FDEs: 0
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
new file mode 100644
index 0000000..df8e7d2
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s
+
+## Tests that .cfi_escape sequences representable in SFrame pass through and still produce FDEs.
+
+ .align 1024
+cfi_escape_ok:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+ ## Uninteresting register
+## DW_CFA_expression,reg 0xc,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x10,0xc,0x2,0x76,0x78
+## DW_CFA_nop
+ .cfi_escape 0x0
+ .cfi_escape 0x0,0x0,0x0,0x0
+ ## Uninteresting register
+## DW_CFA_val_offset,reg 0xc,ULEB scaled offset
+ .cfi_escape 0x14,0xc,0x4
+ .long 0
+ .cfi_endproc
+
+cfi_escape_gnu_args_fp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is OK if arg size is zero
+ .cfi_escape 0x2e, 0x0
+ .long 0
+ .cfi_def_cfa_register 6
+ .long 0
+## DW_CFA_GNU_args_size is OK if the CFA is FP
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_long_expr:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## This is a long, but valid, DWARF expression without SFrame
+## implications. An FDE can still be created.
+## DW_CFA_val_offset,rcx,ULEB scaled offset(16), DW_CFA_expression,r10,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x14,0x2,0x2,0x10,0xa,0x2,0x76,0x78
+ .long 0
+ .cfi_endproc
+
+# CHECK: Num FDEs: 3
diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s
index eb362a7..94a6eb1 100644
--- a/llvm/test/MC/Hexagon/arch-support.s
+++ b/llvm/test/MC/Hexagon/arch-support.s
@@ -10,6 +10,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V73 %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V75 %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V79 %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V81 %s
## Check which arch version llvm-mc sets when the user does not provide one.
# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-DEFAULT %s
@@ -26,6 +27,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
.text
r1 = r1
@@ -41,6 +43,7 @@ r1 = r1
# CHECK-V73: Flags:{{.*}}0x73
# CHECK-V75: Flags:{{.*}}0x75
# CHECK-V79: Flags:{{.*}}0x79
+# CHECK-V81: Flags:{{.*}}0x81
# CHECK-DEFAULT: Flags:{{.*}}0x68
# CHECK-OBJDUMP: { r1 = r1 }
diff --git a/llvm/test/MC/Hexagon/v81_arch.s b/llvm/test/MC/Hexagon/v81_arch.s
new file mode 100644
index 0000000..0cd5d6b
--- /dev/null
+++ b/llvm/test/MC/Hexagon/v81_arch.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv81 -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv81 -mhvx -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+r1=memw(r0)
+{ r0=r0
+ memw(r0)=r0.new }
+
+# CHECK: { r1 = memw(r0+#0x0) }
+# CHECK: { r0 = r0
+# CHECK: memw(r0+#0x0) = r0.new }
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
index 959f3c5..6662220 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -3491,12 +3491,18 @@
# CHECK-BE: mfamr 2 # encoding: [0x7c,0x5d,0x02,0xa6]
# CHECK-LE: mfamr 2 # encoding: [0xa6,0x02,0x5d,0x7c]
mfamr 2
-# CHECK-BE: mtpid 2 # encoding: [0x7c,0x50,0x0b,0xa6]
-# CHECK-LE: mtpid 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
mtpid 2
-# CHECK-BE: mfpid 2 # encoding: [0x7c,0x50,0x0a,0xa6]
-# CHECK-LE: mfpid 2 # encoding: [0xa6,0x0a,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+ mtpidr 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
mfpid 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
+ mfpidr 2
# CHECK-BE: mtlr 2 # encoding: [0x7c,0x48,0x03,0xa6]
# CHECK-LE: mtlr 2 # encoding: [0xa6,0x03,0x48,0x7c]
mtlr 2
diff --git a/llvm/test/Transforms/InstCombine/ctlz-cttz.ll b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
new file mode 100644
index 0000000..871fb34
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+; ctlz(~i & (i - 1)) -> bitwidth - cttz(i, false)
+define i8 @ctlz_to_sub_bw_cttz(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
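+
+; Worked example (hand-written): for %a0 = 0b00010100, %and is 0b00000011,
+; the mask of the two trailing zeros, so ctlz returns 6 = 8 - cttz(%a0).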
+
+define i8 @ctlz_to_sub_bw_cttz_poison(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_poison(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 true)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_add(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_add(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], 1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, 1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_xor(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_xor(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, 1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+declare void @use(i8)
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[DEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ call void @use(i8 %dec)
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_not(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_not(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[NOT]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ call void @use(i8 %not)
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: call void @use(i8 [[AND]])
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ call void @use(i8 %and)
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_commute_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_commute_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %not, %dec
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(<2 x i8> %a0) {
+; CHECK-LABEL: define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(
+; CHECK-SAME: <2 x i8> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw <2 x i8> splat (i8 8), [[TMP1]]
+; CHECK-NEXT: ret <2 x i8> [[CLZ]]
+;
+ %dec = add <2 x i8> %a0, <i8 -1, i8 -1>
+ %not = xor <2 x i8> %a0, <i8 -1, i8 -1>
+ %and = and <2 x i8> %dec, %not
+ %clz = tail call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %and, i1 false)
+ ret <2 x i8> %clz
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index c0be5b9..2ae062cd 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -519,9 +519,7 @@ define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
define i32 @scmp_sgt_slt(i32 %a) {
; CHECK-LABEL: define i32 @scmp_sgt_slt(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i32 [[A]], 31
-; CHECK-NEXT: [[CMP_INV:%.*]] = icmp slt i32 [[A]], 1
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP_INV]], i32 [[A_LOBIT]], i32 1
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
; CHECK-NEXT: ret i32 [[RETVAL_0]]
;
%cmp = icmp sgt i32 %a, 0
@@ -747,3 +745,55 @@ define i8 @scmp_from_select_eq_and_gt_neg3(i32 %x, i32 %y) {
%r = select i1 %eq, i8 0, i8 %sel1
ret i8 %r
}
+
+define i32 @scmp_ashr(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_ashr(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %a.lobit = ashr i32 %a, 31
+ %cmp.inv = icmp slt i32 %a, 1
+ %retval.0 = select i1 %cmp.inv, i32 %a.lobit, i32 1
+ ret i32 %retval.0
+}
+
+; select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
+define i8 @scmp_ashr_sgt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_sgt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp sgt i8 %a, 0
+ %retval = select i1 %cmp, i8 1, i8 %a.lobit
+ ret i8 %retval
+}
+
+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+define i8 @scmp_ashr_slt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
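+
+; In both patterns above, ashr X, bitwidth-1 is -1 for negative X and 0
+; otherwise, while the compare routes 1 through for positive X; that is
+; exactly sign(X), i.e. scmp(X, 0).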
+
+define i8 @scmp_ashr_slt_pattern_neg(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern_neg(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i8 [[A]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], 1
+; CHECK-NEXT: [[RETVAL:%.*]] = select i1 [[CMP]], i8 [[A_LOBIT]], i8 1
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+ %a.lobit = ashr i8 %a, 4
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 43fb260..d981626 100644
--- a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,7 +1,5 @@
; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
; RUN: opt -passes=safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -passes=safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
define void @foo() nounwind uwtable safestack sspreq {
entry:
@@ -10,7 +8,6 @@ entry:
; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer.p0()
; ANDROID: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 40
-; FUCHSIA: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 -16
; TLS: %[[StackGuard:.*]] = load ptr, ptr %[[B]]
; TLS: store ptr %[[StackGuard]], ptr %[[StackGuardSlot:.*]]
%a = alloca i128, align 16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 49af4df..c20409e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -6864,7 +6864,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
-# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1536.75 1281.75 794.25 748.25
+# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1540.75 1285.75 790.25 744.25
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
@@ -9617,39 +9617,39 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z21.s - z24.s }, p5, [x10, #20, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - 4.50 4.50 4.50 4.50 9.00 9.00 - - st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0, x0]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0, x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z0.d }, p0, [z1.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z21.b }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z23.b }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0, x0, lsl #3]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z21.d }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z23.d }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.50 0.50 - - stnt1h { z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z0.h }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z21.h }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z23.h }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0, x0, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z21.s }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z23.s }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z31.s }, p7, [z31.s, x0]
diff --git a/llvm/test/tools/llvm-profdata/input-wildcard.test b/llvm/test/tools/llvm-profdata/input-wildcard.test
new file mode 100644
index 0000000..f2c46c9
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/input-wildcard.test
@@ -0,0 +1,15 @@
+# This test verifies that llvm-profdata will do wildcard expansion on its
+# arguments. The expansion is done by Windows-specific support in InitLLVM, so
+# we only expect this to work on Windows hosts.
+# REQUIRES: system-windows
+
+# Create two files to glob.
+RUN: echo '# empty profile 1' > %t.prof1.proftxt
+RUN: echo '# empty profile 2' > %t.prof2.proftxt
+
+# Prevent LIT itself from globbing by quoting the wildcard argument.
+RUN: llvm-profdata merge "%t.*.proftxt" -dump-input-file-list -o /dev/null | FileCheck %s
+
+# Verify that llvm-profdata expanded the wildcard argument.
+CHECK: 1,{{.*}}.prof1.proftxt
+CHECK-NEXT: 1,{{.*}}.prof2.proftxt
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 15ddb05..a356bcd 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -34,7 +34,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -3465,6 +3465,7 @@ static int order_main() {
}
int main(int argc, const char *argv[]) {
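+  // InitLLVM installs stack-trace and signal handlers and, on Windows, expands
+  // wildcard command-line arguments (exercised by input-wildcard.test).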
+ InitLLVM X(argc, argv);
StringRef ProgName(sys::path::filename(argv[0]));
if (argc < 2) {
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index de6cc8f..7b563d7 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
IRReader
JITLink
Object
+ ObjectYAML
OrcDebugging
OrcJIT
OrcShared
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7184de9..d74abc2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -912,12 +912,10 @@ def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN
VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>;
// wmma
-def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<
- [4, 8, 16],
- [F16, BF16,
- I8, SI8, UI8,
- I<4>, SI<4>, UI<4>,
- F8E4M3FN, F8E5M2]>]>;
+def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>,
+ VectorOfLengthAndType<[4, 8, 16], [I8, SI8, UI8]>,
+ VectorOfLengthAndType<[4, 8], [F8E4M3FN, F8E5M2]>,
+ VectorOfLengthAndType<[4, 8, 16], [I<4>, SI<4>, UI<4>]>]>;
def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>,
VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>]>;
@@ -968,6 +966,14 @@ def AMDGPU_MFMAOp :
The negateA, negateB, and negateC flags are only supported for double-precision
operations on gfx94x.
+
+ Example:
+ ```mlir
+ %0 = amdgpu.mfma %matA * %matB + %matC
+ { abid = 1 : i32, cbsz = 1 : i32,
+ m = 32 : i32, n = 32 : i32, k = 1 : i32, blocks = 2 : i32 }
+ blgp = bcast_second_32 : f32, f32, vector<32xf32>
+ ```
}];
let assemblyFormat = [{
$sourceA `*` $sourceB `+` $destC
@@ -982,36 +988,43 @@ def AMDGPU_WMMAOp :
AMDGPU_Op<"wmma", [AllTypesMatch<["destC", "destD"]>,
Pure]>,
Arguments<(ins
+ ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$m,
+ ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$n,
+ ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$k,
WMMAInTypes:$sourceA,
WMMAInTypes:$sourceB,
WMMAOutTypes:$destC,
- DefaultValuedAttr<ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>, "0">:$subwordOffset,
+ DefaultValuedAttr<ConfinedAttr<I32Attr, [IntIsOneOf<[0, 1]>]>, "0">:$subwordOffset,
UnitAttr:$unsignedA,
UnitAttr:$unsignedB,
UnitAttr:$clamp)>,
Results<(outs WMMAOutTypes: $destD)> {
- let summary = "MLIR wrapper for RDNA3 wmma instructions";
+ let summary = "MLIR wrapper for wmma instructions";
let description = [{
- The `amdgpu.wmma` op is an MLIR wrapper around intrinsics
- for various `wmma` instructions in the RDNA3 or RDNA4 architecture, which
- perform a 16x16 * 16x16 matrix multiplication for different data types.
- Note that in gfx12/RDNA4, there is also a 16x32 * 32x16 instruction for 4-bit
- integer inputs.
+ The `amdgpu.wmma` op is an MLIR wrapper around intrinsics for various `wmma`
+ instructions in the AMDGPU architecture, which perform matrix multiplication.
+    Note that all wmma intrinsics have M=N=16 dimensions but vary in the
+    allowed K dimensions.
    On gfx11/RDNA3, when emitting an f16->f16 (or bf16->bf16) wmma, the output
    is a 16xf16 (or 16xbf16) vector containing only 8 valid values:
- If `subwordOffset` is 0, then the output is stored at indices 0, 2, 4, ..., 14.
- If `subwordOffset` is 1, then the output is stored at indices 1, 3, 5, ..., 15.
- On gfx12/RDNA4, the result is instead returned as a vector<8 x f16/bf16> where
- all values are valid and the `subwordOffset` must be `0`, as it cannot be used.
+    On gfx12/RDNA4 and gfx1250, the result is instead returned as a vector where
+    all the values are valid and the `subwordOffset` must be `0`, as it cannot
+    be used.
`unsignedA` and `unsignedB` flag that the `int8` LLVM inputs are unsigned.
- The `clamp` flag is used to saturate the output of type T to numeric_limits<T>::max()
+ The `clamp` flag is used to saturate the output of type T to `numeric_limits<T>::max()`
in case of overflow.
+
+ Example:
+ ```mlir
+ %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<16xf16>, vector<16xf16>, vector<8xf16>
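+    // An illustrative gfx11 variant (operand names hypothetical): with a full
+    // 16xf16 accumulator, subwordOffset = 1 stores the 8 results at odd indices.
+    %1 = amdgpu.wmma 16x16x16 %matA * %matB + %acc {subwordOffset = 1 : i32} : vector<16xf16>, vector<16xf16>, vector<16xf16>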
+ ```
}];
let assemblyFormat = [{
- $sourceA `*` $sourceB `+` $destC
+ custom<MNKDimensionList>($m, $n, $k) $sourceA `*` $sourceB `+` $destC
attr-dict
`:` type($sourceA) `,` type($sourceB) `,` type($destC)
}];
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index 3de57c9..dcd9f95 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file declares a dialect for MLIR wrappers around AMDGPU-specific
-// intrinssics and for other AMD GPU-specific functionality.
+// intrinsics and for other AMD GPU-specific functionality.
//
//===----------------------------------------------------------------------===//
@@ -26,6 +26,29 @@
#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc"
+namespace mlir::amdgpu {
+/// Parser for the `custom<MNKDimensionList>` custom assembly format used by
+/// WMMAOp.
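+/// For example, `16x16x32` parses as m = 16, n = 16, k = 32.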
+ParseResult parseMNKDimensionList(OpAsmParser &parser, IntegerAttr &m,
+ IntegerAttr &n, IntegerAttr &k);
+inline ParseResult parseMNKDimensionList(OpAsmParser &parser, Operation *,
+ IntegerAttr &m, IntegerAttr &n,
+ IntegerAttr &k) {
+ return parseMNKDimensionList(parser, m, n, k);
+}
+
+/// Printer for the `custom<MNKDimensionList>` custom assembly format used by
+/// WMMAOp.
+inline void printMNKDimensionList(OpAsmPrinter &printer, IntegerAttr m,
+ IntegerAttr n, IntegerAttr k) {
+ printer.printDimensionList(ArrayRef{m.getInt(), n.getInt(), k.getInt()});
+}
+inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
+ IntegerAttr m, IntegerAttr n, IntegerAttr k) {
+ printMNKDimensionList(printer, m, n, k);
+}
+} // namespace mlir::amdgpu
+
#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc"
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
index 6e78b75..8b5934f 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
@@ -864,6 +864,34 @@ extensionComplianceMap = {
{{bf16T, fp6e3m2T}, SpecificationVersion::V_1_1_DRAFT},
{{bf16T, fp6e2m3T}, SpecificationVersion::V_1_1_DRAFT}},
allOf}}},
+ {"tosa.cast_from_block_scaled",
+ {{{Extension::bf16, Extension::mxfp},
+ {{{fp4e2m1T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp6e2m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp6e3m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp8e4m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp8e5m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}},
+ allOf},
+ {{Extension::mxfp},
+ {{{fp4e2m1T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp6e2m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp6e3m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp8e4m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp8e5m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}},
+ {"tosa.cast_to_block_scaled",
+ {{{Extension::mxfp},
+ {{{bf16T, fp4e2m1T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp32T, fp4e2m1T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp32T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp32T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp32T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{fp32T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}},
+ {{Extension::bf16, Extension::mxfp},
+ {{{bf16T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{bf16T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{bf16T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+ {{bf16T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}},
+ allOf}}},
{"tosa.rescale",
{{{Extension::int16},
{{{i48T, i48T, i8T, i8T}, SpecificationVersion::V_1_0},
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 0e3df21..6e17591 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -2471,6 +2471,69 @@ def Tosa_CastOp: Tosa_Op<"cast", [Pure, SameOperandsAndResultShape,
}
//===----------------------------------------------------------------------===//
+// Operator: cast_from_block_scaled
+//===----------------------------------------------------------------------===//
+def Tosa_CastFromBlockScaledOp: Tosa_InferShapedTypeOp<"cast_from_block_scaled"> {
+ let summary = "Apply scales from a scale tensor to the values in a value tensor";
+
+ let description = [{
+ Apply the scales from a scale tensor to the values in a value tensor, casting
+ the result to the output type. The block dimension must be the last dimension
+ of the tensor.
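+
+    Example (using placeholder `%data`/`%scale` values): with `BLOCK_SIZE_32`,
+    each scale element covers 32 data elements, so a `4x32` data tensor pairs
+    with a `4x1` scale tensor:
+    ```mlir
+    %0 = tosa.cast_from_block_scaled %data, %scale {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+    ```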
+ }];
+
+ let arguments = (ins
+ Tosa_MXFPDataTensorAtLeast1D:$input_data,
+ Tosa_MXFPScaleTensorAtLeast1D:$input_scale,
+ Tosa_BlockSizeAttr:$block_size
+ );
+
+ let results = (outs
+ Tosa_TensorAtLeast1D: $output_data
+ );
+
+ list<Availability> availability = [
+ Profile<[Tosa_PRO_FP]>,
+ Extension<[Tosa_EXT_BF16, Tosa_EXT_MXFP]>,
+ ];
+
+ let hasVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Operator: cast_to_block_scaled
+//===----------------------------------------------------------------------===//
+def Tosa_CastToBlockScaledOp : Tosa_InferShapedTypeOp<"cast_to_block_scaled"> {
+  let summary = "Calculate per-block scale values and output separate scale and data tensors";
+
+ let description = [{
+    Calculate a scale value per block of input values and use it to compute
+    scaled data values from the input tensor. The output tensors are cast to the
+    specified scale and value types. The block dimension is the last dimension
+    of the tensor.
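+
+    Example (using a placeholder `%input` value): with `BLOCK_SIZE_32`, a
+    `4x32` f32 input produces a `4x32` data tensor and a `4x1` scale tensor:
+    ```mlir
+    %0:2 = tosa.cast_to_block_scaled %input {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+    ```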
+ }];
+
+ let arguments = (ins
+ Tosa_TensorAtLeast1D:$input_data,
+ Tosa_BlockSizeAttr:$block_size
+ );
+
+ let results = (outs
+ Tosa_MXFPDataTensorAtLeast1D:$output_data,
+ Tosa_MXFPScaleTensorAtLeast1D:$output_scale
+ );
+
+ list<Availability> availability = [
+ Profile<[Tosa_PRO_FP]>,
+ Extension<[Tosa_EXT_BF16, Tosa_EXT_MXFP]>
+ ];
+
+ let hasVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+}
+
+//===----------------------------------------------------------------------===//
// Operator: rescale
//===----------------------------------------------------------------------===//
def Tosa_RescaleOp : Tosa_InferShapedTypeOp<"rescale"> {
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
index b76228f..45d380c 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
@@ -79,7 +79,8 @@ private:
LogicalResult populatationDispatch(Operation *op);
- LogicalResult populateProfileInfo(ValueRange operands, Value output);
+ // Add input operands and output results to the profile type info list
+ LogicalResult populateProfileInfo(ValueRange operands, ValueRange results);
// Base
template <typename T>
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 20bb961..93843e8 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -199,6 +199,16 @@ def Tosa_MXFPScaleTensor3D : AnyTypeOf<[
TosaUnrankedTensorOf<[Tosa_MXFPScaleNumber]>,
TosaTensorRankOf<[Tosa_MXFPScaleNumber], [3]>
]>;
+def Tosa_MXFPDataTensorAtLeast1D : AnyTypeOf<[
+ TosaUnrankedTensorOf<[Tosa_MXFPNumber]>,
+ TosaRankedTensorOf<[Tosa_MXFPNumber], [AtLeastRankOne]>],
+ "tosa-conformant tensor of at least rank 1", "::mlir::TensorType"
+>;
+def Tosa_MXFPScaleTensorAtLeast1D : AnyTypeOf<[
+ TosaUnrankedTensorOf<[Tosa_MXFPScaleNumber]>,
+ TosaRankedTensorOf<[Tosa_MXFPScaleNumber], [AtLeastRankOne]>],
+ "tosa-conformant tensor of at least rank 1", "::mlir::TensorType"
+>;
//===----------------------------------------------------------------------===//
// Generic scalar, vector, or tensor of a particular type.
diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td
index e1869c1..b7e168a 100644
--- a/mlir/include/mlir/IR/CommonAttrConstraints.td
+++ b/mlir/include/mlir/IR/CommonAttrConstraints.td
@@ -804,6 +804,11 @@ def IntPositivePowerOf2 : AllAttrOf<[IntPositive, IntPowerOf2]>;
class IntValidAlignment<Attr attr>: ConfinedAttr<attr, [IntPositivePowerOf2]>;
+class IntIsOneOf<list<int> values> : AttrConstraint<
+ CPred<"::llvm::is_contained({" # !interleave(!foreach(val, values, val), ", ") #
+ "}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">,
+ "whose value is one of {" # !interleave(!foreach(val, values, val), ", ") # "}">;
+
class ArrayMaxCount<int n> : AttrConstraint<
CPred<"::llvm::cast<::mlir::ArrayAttr>($_self).size() <= " # n>,
"with at most " # n # " elements">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 9b15435..478b6aa 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
@@ -993,28 +994,36 @@ mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) {
/// on the architecture you are compiling for.
static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
Chipset chipset) {
- auto sourceVectorType = dyn_cast<VectorType>(wmma.getSourceA().getType());
- auto sourceBVectorType = dyn_cast<VectorType>(wmma.getSourceB().getType());
- auto destVectorType = dyn_cast<VectorType>(wmma.getDestC().getType());
- auto elemSourceType = sourceVectorType.getElementType();
- auto elemBSourceType = sourceBVectorType.getElementType();
- auto elemDestType = destVectorType.getElementType();
-
- if (elemSourceType.isF16() && elemDestType.isF32())
- return ROCDL::wmma_f32_16x16x16_f16::getOperationName();
- if (elemSourceType.isBF16() && elemDestType.isF32())
- return ROCDL::wmma_f32_16x16x16_bf16::getOperationName();
- if (elemSourceType.isF16() && elemDestType.isF16())
- return ROCDL::wmma_f16_16x16x16_f16::getOperationName();
- if (elemSourceType.isBF16() && elemDestType.isBF16())
- return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName();
- if (elemSourceType.isInteger(8) && elemDestType.isInteger(32))
- return ROCDL::wmma_i32_16x16x16_iu8::getOperationName();
- if (chipset.majorVersion == 11) {
- if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
- return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
+ auto sourceVectorType = cast<VectorType>(wmma.getSourceA().getType());
+ auto sourceBVectorType = cast<VectorType>(wmma.getSourceB().getType());
+ auto destVectorType = cast<VectorType>(wmma.getDestC().getType());
+ Type elemSourceType = sourceVectorType.getElementType();
+ Type elemBSourceType = sourceBVectorType.getElementType();
+ Type elemDestType = destVectorType.getElementType();
+
+ const uint32_t k = wmma.getK();
+
+ if (k == 16) {
+ if (elemSourceType.isF16() && elemDestType.isF32())
+ return ROCDL::wmma_f32_16x16x16_f16::getOperationName();
+ if (elemSourceType.isBF16() && elemDestType.isF32())
+ return ROCDL::wmma_f32_16x16x16_bf16::getOperationName();
+ if (elemSourceType.isF16() && elemDestType.isF16())
+ return ROCDL::wmma_f16_16x16x16_f16::getOperationName();
+ if (elemSourceType.isBF16() && elemDestType.isBF16())
+ return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName();
+ if (elemSourceType.isInteger(8) && elemDestType.isInteger(32))
+ return ROCDL::wmma_i32_16x16x16_iu8::getOperationName();
+ if (chipset.majorVersion == 11) {
+ if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
+ return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
+ }
}
- if (chipset.majorVersion >= 12) {
+ if (chipset.majorVersion < 12)
+ return std::nullopt;
+
+ // gfx12+
+ if (k == 16) {
if (isa<Float8E4M3FNType>(elemSourceType) &&
isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
return ROCDL::wmma_f32_16x16x16_fp8_fp8::getOperationName();
@@ -1027,17 +1036,18 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
if (isa<Float8E5M2Type>(elemSourceType) &&
isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
return ROCDL::wmma_f32_16x16x16_bf8_fp8::getOperationName();
- if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) {
- bool isWave64 = destVectorType.getNumElements() == 4;
- // This is the ambiguous case. 8 inputs to the wave64 version means that
- // we want the 16x16x32 version, but for wave32 they mean the short form.
- bool has8Inputs = sourceVectorType.getNumElements() == 8;
- if ((isWave64 && has8Inputs) || (!isWave64 && !has8Inputs))
- return ROCDL::wmma_i32_16x16x32_iu4::getOperationName();
+ if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
- }
+
+ return std::nullopt;
}
- return std::nullopt;
+ if (k == 32) {
+ if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
+ return ROCDL::wmma_i32_16x16x32_iu4::getOperationName();
+ return std::nullopt;
+ }
+
+ llvm_unreachable("unhandled WMMA case");
}
namespace {
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 61166db..4c4965e 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -360,45 +360,53 @@ LogicalResult ScaledExtPacked816Op::verify() {
//===----------------------------------------------------------------------===//
// WMMAOp
//===----------------------------------------------------------------------===//
-LogicalResult WMMAOp::verify() {
- Type sourceAType = getSourceA().getType();
- Type sourceBType = getSourceB().getType();
- Type destType = getDestC().getType();
- VectorType sourceVectorAType = dyn_cast<VectorType>(sourceAType);
- VectorType sourceVectorBType = dyn_cast<VectorType>(sourceBType);
- VectorType destVectorType = dyn_cast<VectorType>(destType);
+ParseResult mlir::amdgpu::parseMNKDimensionList(OpAsmParser &parser,
+ IntegerAttr &m, IntegerAttr &n,
+ IntegerAttr &k) {
+ SmallVector<int64_t, 3> dimensions;
+  if (parser.parseDimensionList(dimensions, /*allowDynamic=*/false,
+                                /*withTrailingX=*/false))
+ return failure();
+ if (dimensions.size() != 3)
+ return parser.emitError(parser.getCurrentLocation())
+ << "expected 3 dimensions in MNK dimension list";
- Type sourceAElemType = sourceVectorAType.getElementType();
- Type sourceBElemType = sourceVectorBType.getElementType();
- Type destElemType = destVectorType.getElementType();
+ m = parser.getBuilder().getI32IntegerAttr(dimensions[0]);
+ n = parser.getBuilder().getI32IntegerAttr(dimensions[1]);
+ k = parser.getBuilder().getI32IntegerAttr(dimensions[2]);
+ return success();
+}
- if (sourceVectorAType.getNumElements() !=
- sourceVectorBType.getNumElements()) {
+LogicalResult WMMAOp::verify() {
+ auto sourceAType = cast<VectorType>(getSourceA().getType());
+ auto sourceBType = cast<VectorType>(getSourceB().getType());
+ auto destType = cast<VectorType>(getDestC().getType());
+
+ Type sourceAElemType = sourceAType.getElementType();
+ Type sourceBElemType = sourceBType.getElementType();
+ if (sourceAType.getNumElements() != sourceBType.getNumElements()) {
return emitOpError("source vectors have different lengths: ")
- << sourceVectorAType << " vs. " << sourceVectorBType;
+ << sourceAType << " vs. " << sourceBType;
}
- bool isDestFloat = isa<Float32Type, Float16Type, BFloat16Type>(destElemType);
- bool isSrcFloat =
- isa<Float16Type, BFloat16Type, Float8E4M3FNType, Float8E5M2Type>(
- sourceAElemType);
-
- if (isDestFloat && !isSrcFloat) {
- return emitOpError("Expected float sources with float destination");
- }
+ bool isDestFloat = destType.getElementType().isFloat();
+ bool isSrcFloat = sourceAElemType.isFloat();
- if (!isDestFloat && isSrcFloat) {
- return emitOpError("Expected int sources with int destination");
- }
+ if (isDestFloat && !isSrcFloat)
+ return emitOpError("expected float sources with float destination");
+ if (!isDestFloat && isSrcFloat)
+ return emitOpError("expected int sources with int destination");
- if (sourceAElemType != sourceBElemType &&
- !(isa<Float8E5M2Type, Float8E4M3FNType>(sourceAElemType) &&
- isa<Float8E5M2Type, Float8E4M3FNType>(sourceBElemType))) {
+ if (!sourceAElemType.isFloat(8) && sourceAElemType != sourceBElemType) {
return emitOpError(
"source element types much match (except for fp8) but have ")
<< sourceAType << " and " << sourceBType;
}
+
+ if (!sourceAElemType.isInteger(4) && getK() != 16) {
+ return emitOpError("K dimension must be 16 for source element type ")
+ << sourceAElemType;
+ }
return success();
}
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 6cd0eae..0aff67f 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -370,7 +370,7 @@ ParseResult parseWithEnumHandling(OpAsmParser &parser, OperationState &result) {
result.operands)))
return failure();
- result.addTypes(fnTy.getResult(0));
+ result.addTypes(fnTy.getResults());
result.addAttributes(attrs);
return success();
@@ -532,6 +532,24 @@ void MatmulTBlockScaledOp::print(OpAsmPrinter &parser) {
printWithEnumHandling(parser, *this);
}
+ParseResult CastFromBlockScaledOp::parse(OpAsmParser &parser,
+ OperationState &result) {
+ return parseWithEnumHandling<tosa::BlockSize>(parser, result);
+}
+
+void CastFromBlockScaledOp::print(OpAsmPrinter &parser) {
+ printWithEnumHandling(parser, *this);
+}
+
+ParseResult CastToBlockScaledOp::parse(OpAsmParser &parser,
+ OperationState &result) {
+ return parseWithEnumHandling<tosa::BlockSize>(parser, result);
+}
+
+void CastToBlockScaledOp::print(OpAsmPrinter &parser) {
+ printWithEnumHandling(parser, *this);
+}
+
//===----------------------------------------------------------------------===//
// Tosa utilities.
//===----------------------------------------------------------------------===//
@@ -3944,6 +3962,145 @@ LogicalResult RescaleOp::inferReturnTypeComponents(
return success();
}
+LogicalResult CastFromBlockScaledOp::inferReturnTypeComponents(
+ MLIRContext *context, ::std::optional<Location> location,
+ CastFromBlockScaledOp::Adaptor adaptor,
+ SmallVectorImpl<ShapedTypeComponents> &inferredReturnShapes) {
+ const ShapeAdaptor inputShape(adaptor.getInputData().getType());
+ inferredReturnShapes.push_back(ShapedTypeComponents(inputShape));
+ return success();
+}
+
+LogicalResult CastFromBlockScaledOp::verify() {
+ const Type inputDataType = getInputData().getType();
+ const Type outputDataType = getResult().getType();
+ if (failed(verifyCompatibleShape(inputDataType, outputDataType)))
+ return emitOpError() << "require compatible shapes for input_data ("
+ << inputDataType << ") and "
+ << "output_data (" << outputDataType << ")";
+
+ const ShapeAdaptor inputDataShape = ShapeAdaptor(inputDataType);
+
+ if (inputDataShape.hasRank()) {
+ const unsigned int blockSize =
+ BlockSizeAttr::getBlockSizeValue(getBlockSize());
+ const int64_t inputDataLastDim =
+ inputDataShape.getDimSize(inputDataShape.getRank() - 1);
+ if (inputDataLastDim % blockSize != 0)
+ return emitOpError() << "expect last dimension of input_data ("
+ << inputDataLastDim
+ << ") to be divisible by block_size (" << blockSize
+ << ")";
+
+ const Type inputScaleType = getInputScale().getType();
+ const ShapeAdaptor inputScaleShape = ShapeAdaptor(inputScaleType);
+
+ if (inputScaleShape.hasRank()) {
+ SmallVector<int64_t> inputDataDims, inputScaleDims;
+ inputDataShape.getDims(inputDataDims);
+ inputScaleShape.getDims(inputScaleDims);
+
+ if (inputDataDims.size() != inputScaleDims.size() ||
+ failed(verifyCompatibleShape(
+ ArrayRef<int64_t>(inputDataDims).drop_back(1),
+ ArrayRef<int64_t>(inputScaleDims).drop_back(1))))
+ return emitOpError() << "require compatible shapes for input_data ("
+ << inputDataType << ") and "
+ << "input_scale (" << inputScaleType
+ << ") except for the last dimension";
+
+ const SmallVector<int64_t, 2> dimsToCheck{inputDataLastDim / blockSize,
+ inputScaleDims.back()};
+ if (ShapedType::isStatic(inputDataLastDim) &&
+ failed(verifyCompatibleDims(dimsToCheck)))
+ return emitOpError()
+ << "expect last dimension of input_scale ("
+ << inputScaleDims.back()
+ << ") to be equal to last dimension of input_data / block_size ("
+ << inputDataDims.back() / blockSize << ")";
+ }
+ }
+
+ return success();
+}
+
+LogicalResult CastToBlockScaledOp::inferReturnTypeComponents(
+ MLIRContext *context, ::std::optional<Location> location,
+ CastToBlockScaledOp::Adaptor adaptor,
+ SmallVectorImpl<ShapedTypeComponents> &inferredReturnShapes) {
+ const ShapeAdaptor inputShape(adaptor.getInputData().getType());
+ inferredReturnShapes.push_back(ShapedTypeComponents(inputShape));
+ if (!inputShape.hasRank())
+ return success();
+
+  // Calculate the output_scale shape when a ranked input is provided.
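+  // e.g. a 4x32 input with BLOCK_SIZE_32 gives an output_scale shape of 4x1.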
+ SmallVector<int64_t> outputScaleShape;
+ inputShape.getDims(outputScaleShape);
+ const int64_t lastDimLoc = inputShape.getRank() - 1;
+ const int64_t lastDimSize = inputShape.getDimSize(lastDimLoc);
+ if (ShapedType::isStatic(lastDimSize)) {
+ const unsigned int blockSize =
+ BlockSizeAttr::getBlockSizeValue(adaptor.getBlockSize());
+ outputScaleShape[lastDimLoc] = lastDimSize / blockSize;
+ }
+ inferredReturnShapes.push_back(ShapedTypeComponents(outputScaleShape));
+ return success();
+}
+
+LogicalResult CastToBlockScaledOp::verify() {
+ const Type inputDataType = getInputData().getType();
+ const Type outputDataType = getResult(0).getType();
+ if (failed(verifyCompatibleShape(inputDataType, outputDataType)))
+ return emitOpError() << "require compatible shapes for input_data ("
+ << inputDataType << ") and "
+ << "output_data (" << outputDataType << ")";
+
+ const unsigned int blockSize =
+ BlockSizeAttr::getBlockSizeValue(getBlockSize());
+ const ShapeAdaptor inputDataShape = ShapeAdaptor(inputDataType);
+ if (inputDataShape.hasRank()) {
+ const int64_t inputDataLastDim =
+ inputDataShape.getDimSize(inputDataShape.getRank() - 1);
+ if (ShapedType::isStatic(inputDataLastDim) &&
+ inputDataLastDim % blockSize != 0)
+ return emitOpError() << "expect last dimension of input_data ("
+ << inputDataLastDim
+ << ") to be divisible by block_size (" << blockSize
+ << ")";
+ }
+
+ const ShapeAdaptor outputDataShape = ShapeAdaptor(outputDataType);
+ const Type outputScaleType = getResult(1).getType();
+ const ShapeAdaptor outputScaleShape = ShapeAdaptor(outputScaleType);
+ if (outputDataShape.hasRank() && outputScaleShape.hasRank()) {
+ SmallVector<int64_t> outputDataDims, outputScaleDims;
+ outputDataShape.getDims(outputDataDims);
+ outputScaleShape.getDims(outputScaleDims);
+
+ if (outputDataDims.size() != outputScaleDims.size() ||
+ failed(verifyCompatibleShape(
+ ArrayRef<int64_t>(outputDataDims).drop_back(1),
+ ArrayRef<int64_t>(outputScaleDims).drop_back(1))))
+ return emitOpError() << "require compatible shapes for output_data ("
+ << outputDataType << ") and "
+ << "output_scale (" << outputScaleType
+ << ") except for the last dimension";
+
+ const int64_t outputDataLastDim = outputDataDims.back();
+ const SmallVector<int64_t, 2> dimsToCheck{outputDataLastDim / blockSize,
+ outputScaleDims.back()};
+ if (ShapedType::isStatic(outputDataLastDim) &&
+ failed(verifyCompatibleDims(dimsToCheck)))
+ return emitOpError()
+ << "expect last dimension of output_scale ("
+ << outputScaleDims.back()
+ << ") to be equal to last dimension of output_data / block_size ("
+ << outputDataDims.back() / blockSize << ")";
+ }
+
+ return success();
+}
+
LogicalResult IfOp::inferReturnTypeComponents(
MLIRContext *context, ::std::optional<Location> location,
IfOp::Adaptor adaptor,
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
index 53afc5d..ab363ee 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
@@ -51,10 +51,11 @@ TosaProfileCompliance::getProfileComplianceMap() {
// Base populating function
LogicalResult ProfileInfoDepot::populateProfileInfo(ValueRange operands,
- Value output) {
- for (auto operand : operands)
+ ValueRange results) {
+ for (const auto &operand : operands)
addValue(operand);
- addValue(output);
+ for (const auto &result : results)
+ addValue(result);
return success();
}
@@ -177,23 +178,6 @@ LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::ResizeOp op) {
}
template <>
-LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::FFT2dOp op) {
- addValue(op.getInputReal());
- addValue(op.getInputImag());
- addValue(op.getOutputReal());
- addValue(op.getOutputImag());
- return success();
-}
-
-template <>
-LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::RFFT2dOp op) {
- addValue(op.getInputReal());
- addValue(op.getOutputReal());
- addValue(op.getOutputImag());
- return success();
-}
-
-template <>
LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::SelectOp op) {
addValue(op.getOnTrue());
addValue(op.getOnFalse());
@@ -246,7 +230,7 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
  // This helper function populates the info for all operands and results.
#define POPULATE_PROFILE_INFO_COMMON(tosaOp) \
if (isa<tosa::tosaOp##Op>(op)) { \
- return populateProfileInfo(op->getOperands(), op->getResult(0)); \
+ return populateProfileInfo(op->getOperands(), op->getResults()); \
}
// Skip irrelevant operands when they are independent and not tied to any
@@ -257,8 +241,6 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
POPULATE_PROFILE_INFO_CUSTOM(Conv3D)
POPULATE_PROFILE_INFO_CUSTOM(DepthwiseConv2D)
POPULATE_PROFILE_INFO_CUSTOM(Mul)
- POPULATE_PROFILE_INFO_CUSTOM(FFT2d)
- POPULATE_PROFILE_INFO_CUSTOM(RFFT2d)
POPULATE_PROFILE_INFO_CUSTOM(Concat)
POPULATE_PROFILE_INFO_CUSTOM(Pad)
POPULATE_PROFILE_INFO_CUSTOM(Reshape)
@@ -277,7 +259,11 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
  // For most of the tosa operators, all operands are profile/extension related
  // and hence are all considered in this profile-based compliance check.
POPULATE_PROFILE_INFO_COMMON(MatmulTBlockScaled)
+ POPULATE_PROFILE_INFO_COMMON(FFT2d)
+ POPULATE_PROFILE_INFO_COMMON(RFFT2d)
POPULATE_PROFILE_INFO_COMMON(Cast)
+ POPULATE_PROFILE_INFO_COMMON(CastFromBlockScaled)
+ POPULATE_PROFILE_INFO_COMMON(CastToBlockScaled)
POPULATE_PROFILE_INFO_COMMON(Const)
POPULATE_PROFILE_INFO_COMMON(ArgMax)
POPULATE_PROFILE_INFO_COMMON(Sub)
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index cb544ad..4d0b61a 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -635,6 +635,8 @@ LogicalResult TosaValidation::levelCheckRanksAndSizes(Operation *op) {
CHECK_RANKS_AND_SIZES(Transpose);
// Type Conversion
CHECK_RANKS_AND_SIZES(Cast);
+ CHECK_RANKS_AND_SIZES(CastFromBlockScaled);
+ CHECK_RANKS_AND_SIZES(CastToBlockScaled);
CHECK_RANKS_AND_SIZES(Rescale);
// Control Flow Operators
CHECK_RANKS_AND_SIZES(If);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index f77784a..2c37140 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -145,8 +145,26 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(operandOrResult);
if (layout && layout.isForSubgroup()) {
- if (!layout.getEffectiveInstDataAsInt().empty())
- return layout.getEffectiveInstDataAsInt();
+ if (!layout.getEffectiveInstDataAsInt().empty()) {
+ SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
+      // Remove leading unit dimensions from inst_data.
+      // For example, if inst_data is [1, 1, 32], pass [32] as the
+      // unroll/blocking size. Skip this for xegpu nd ops, since their data
+      // will be 2D.
+      // TODO: For vector ops, experiment with the upstream patterns that
+      // remove leading unit dims,
+      // populateCastAwayVectorLeadingOneDimPatterns.
+ Operation *definingOp = value.getDefiningOp();
+ bool skipLeadingUnitDimRemoval =
+ definingOp &&
+ (isa<xegpu::CreateNdDescOp, xegpu::LoadNdOp, xegpu::DpasOp,
+ xegpu::StoreNdOp, xegpu::PrefetchNdOp>(definingOp));
+ if (!skipLeadingUnitDimRemoval) {
+ auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
+ instData.erase(instData.begin(), it);
+ }
+ return instData;
+ }
if (auto type = dyn_cast<ShapedType>(value.getType()))
return llvm::to_vector(type.getShape());
@@ -354,7 +372,6 @@ void XeGPUBlockingPass::runOnOperation() {
// To create a new attribute with a different chunk_size:
auto newEncoding = xegpu::ScatterTensorDescAttr::get(
ctx, tdescTy.getMemorySpace(), blockedChunkSize);
-
encoding = newEncoding;
}
}
@@ -363,7 +380,7 @@ void XeGPUBlockingPass::runOnOperation() {
xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
tdescTy.getLayoutAttr().dropInstData());
} else {
- newTy = type.clone(tileShape, elemTy);
+ newTy = VectorType::get(tileShape, elemTy);
}
if (returnSingleType)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index aafa1b7..e6e71cc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -66,8 +66,6 @@ protected:
Value unpack(ValueRange srcs, Type destTy, ArrayRef<int64_t> blockSize,
Location loc, PatternRewriter &rewriter) const {
if (auto vecTy = dyn_cast<VectorType>(destTy)) {
- assert(vecTy.getRank() == static_cast<int64_t>(blockSize.size()) &&
- "Expecting blockSize size to match the rank of destTy.");
auto shape = vecTy.getShape();
return xegpu::createVectorWithShapeFromValues(rewriter, loc, srcs, shape);
}
@@ -93,8 +91,6 @@ protected:
ArrayRef<int64_t> blockSize, Location loc,
PatternRewriter &rewriter) const {
if (auto vecTy = dyn_cast<VectorType>(src.getType())) {
- assert(vecTy.getRank() == static_cast<int64_t>(blockSize.size()) &&
- "Expecting blockSize size to match the rank of src.");
return xegpu::extractVectorsWithShapeFromValue(rewriter, loc, src,
blockSize);
}
@@ -635,7 +631,7 @@ struct UnrollLoadGatherOpWithOffset
VectorType maskTy = llvm::dyn_cast<VectorType>(mask.getType());
VectorType offsetsTy = llvm::dyn_cast<VectorType>(offsets.getType());
Type elemTy = valueTy.getElementType();
- VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+ VectorType newValueTy = VectorType::get(*targetShape, elemTy);
SmallVector<Type> convertedMaskTypes;
SmallVector<Value> convertedMasks;
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 2c56a43..b4605cd 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -246,11 +246,28 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
if (!computeShapeRatio(srcShape, shape))
return {value};
+ int64_t srcShapeRank = srcShape.size();
+ int64_t targetShapeRank = shape.size();
+
+ SmallVector<int64_t> adjustedTargetShape(srcShape.size());
+ int64_t rankDiff = srcShapeRank - targetShapeRank;
+ std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
+ 1);
+ std::copy(shape.begin(), shape.end(), adjustedTargetShape.begin() + rankDiff);
+
SmallVector<Value> result;
- for (SmallVector<int64_t> offsets : StaticTileOffsetRange(srcShape, shape)) {
+ for (SmallVector<int64_t> offsets :
+ StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
SmallVector<int64_t> staticStrides(offsets.size(), 1);
- result.push_back(vector::ExtractStridedSliceOp::create(
- builder, loc, value, offsets, shape, staticStrides));
+ Value slice = vector::ExtractStridedSliceOp::create(
+ builder, loc, value, offsets, adjustedTargetShape, staticStrides);
+
+ // Reshape to remove leading unit dims if needed
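+      // e.g. a vector<1x32xf32> slice extracted from a [2, 32] source with
+      // target shape [32] is shape_cast back to vector<32xf32>.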
+ if (srcShapeRank > targetShapeRank) {
+ auto targetTy = VectorType::get(shape, vecTy.getElementType());
+ slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
+ }
+ result.push_back(slice);
}
return result;
@@ -274,7 +291,7 @@ Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
for (auto [src, offsets] :
llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
- SmallVector<int64_t> staticStrides(offsets.size(), 1);
+ SmallVector<int64_t> staticStrides(tileShape.size(), 1);
result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
offsets, staticStrides);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir
index 638a7c3..d1301d0 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/wmma.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir
@@ -1,35 +1,36 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 --allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 --allow-unregistered-dialect | FileCheck %s
+
// CHECK-LABEL: @wmma_to_rocdl
func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 : vector<4xf32>,
%arg3 : vector<16xbf16>, %arg4 : vector<8xf16>, %arg5 : vector<8xbf16>,
%arg6 : vector<16xi8>, %arg7 : vector<8xi32>, %arg8 : vector<4xi32>,
%arg9 : vector<16xui8>, %arg10 : vector<16xi4>, %arg11 : vector<8xi4>) {
// CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<4xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg3 * %arg3 + %arg1 : vector<16xbf16>, vector<16xbf16>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg1 : vector<16xbf16>, vector<16xbf16>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg3 * %arg3 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<4xf32>
// CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
- amdgpu.wmma %arg0 * %arg0 + %arg0 {subwordOffset = 1 : i32}: vector<16xf16>, vector<16xf16>, vector<16xf16>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg0 {subwordOffset = 1 : i32}: vector<16xf16>, vector<16xf16>, vector<16xf16>
// CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>
- amdgpu.wmma %arg0 * %arg0 + %arg4 {subwordOffset = 0 : i32}: vector<16xf16>, vector<16xf16>, vector<8xf16>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg4 {subwordOffset = 0 : i32}: vector<16xf16>, vector<16xf16>, vector<8xf16>
// CHECK: %[[raw_bf16x16:.+]] = rocdl.wmma.bf16.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>
// CHECK-NEXT: llvm.bitcast %[[raw_bf16x16]] : vector<16xi16> to vector<16xbf16>
- amdgpu.wmma %arg3 * %arg3 + %arg3 {subwordOffset = 1 : i32}: vector<16xbf16>, vector<16xbf16>, vector<16xbf16>
+ amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg3 {subwordOffset = 1 : i32}: vector<16xbf16>, vector<16xbf16>, vector<16xbf16>
// CHECK: %[[raw_bf16x8:.+]] = rocdl.wmma.bf16.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>
// CHECK-NEXT: llvm.bitcast %[[raw_bf16x8]] : vector<8xi16> to vector<8xbf16>
- amdgpu.wmma %arg3 * %arg3 + %arg5 {subwordOffset = 0 : i32}: vector<16xbf16>, vector<16xbf16>, vector<8xbf16>
+ amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg5 {subwordOffset = 0 : i32}: vector<16xbf16>, vector<16xbf16>, vector<8xbf16>
// CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
- amdgpu.wmma %arg6 * %arg6 + %arg7 {clamp}: vector<16xi8>, vector<16xi8>, vector<8xi32>
+ amdgpu.wmma 16x16x16 %arg6 * %arg6 + %arg7 {clamp}: vector<16xi8>, vector<16xi8>, vector<8xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>
- amdgpu.wmma %arg9 * %arg9 + %arg8 {unsignedA, unsignedB, clamp}: vector<16xui8>, vector<16xui8>, vector<4xi32>
+ amdgpu.wmma 16x16x16 %arg9 * %arg9 + %arg8 {unsignedA, unsignedB, clamp}: vector<16xui8>, vector<16xui8>, vector<4xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
- amdgpu.wmma %arg10 * %arg10 + %arg7 {clamp}: vector<16xi4>, vector<16xi4>, vector<8xi32>
+ amdgpu.wmma 16x16x16 %arg10 * %arg10 + %arg7 {clamp}: vector<16xi4>, vector<16xi4>, vector<8xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32>
- amdgpu.wmma %arg11 * %arg11 + %arg8 {clamp}: vector<8xi4>, vector<8xi4>, vector<4xi32>
+ amdgpu.wmma 16x16x16 %arg11 * %arg11 + %arg8 {clamp}: vector<8xi4>, vector<8xi4>, vector<4xi32>
func.return
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
index 94a1b78..b8973233 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1200 --allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 --allow-unregistered-dialect | FileCheck %s
// CHECK-LABEL: @wmma_to_rocdl
func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>,
%arg2 : vector<8xf32>, %arg3 : vector<4xf32>,
@@ -9,60 +9,60 @@ func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>,
%arg12 : vector<8xi32>, %arg13 : vector<4xi32>,
%arg14 : vector<16xi4>, %arg15 : vector<8xi4>, %arg16 : vector<4xi4>) {
// CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<8xf16>, vector<8xf16>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg0 * %arg0 + %arg2 : vector<8xf16>, vector<8xf16>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg2 : vector<8xf16>, vector<8xf16>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<4xf16>, vector<4xf16>, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg1 * %arg1 + %arg3 : vector<4xf16>, vector<4xf16>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg1 * %arg1 + %arg3 : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg4 * %arg4 + %arg2 : vector<8xbf16>, vector<8xbf16>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg4 * %arg4 + %arg2 : vector<8xbf16>, vector<8xbf16>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<4xi16>, vector<4xi16>, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg5 * %arg5 + %arg3 : vector<4xbf16>, vector<4xbf16>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg5 * %arg5 + %arg3 : vector<4xbf16>, vector<4xbf16>, vector<4xf32>
// CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<8xf16>, vector<8xf16>, vector<8xf16>, i1) -> vector<8xf16>
- amdgpu.wmma %arg0 * %arg0 + %arg0 : vector<8xf16>, vector<8xf16>, vector<8xf16>
+ amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg0 : vector<8xf16>, vector<8xf16>, vector<8xf16>
// CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<4xf16>, vector<4xf16>, vector<4xf16>, i1) -> vector<4xf16>
- amdgpu.wmma %arg1 * %arg1 + %arg1 : vector<4xf16>, vector<4xf16>, vector<4xf16>
+ amdgpu.wmma 16x16x16 %arg1 * %arg1 + %arg1 : vector<4xf16>, vector<4xf16>, vector<4xf16>
// CHECK: %[[raw_bf16x8:.+]] = rocdl.wmma.bf16.16x16x16.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<8xi16>, i1) -> vector<8xi16>
// CHECK-NEXT: llvm.bitcast %[[raw_bf16x8]] : vector<8xi16> to vector<8xbf16>
- amdgpu.wmma %arg4 * %arg4 + %arg4 : vector<8xbf16>, vector<8xbf16>, vector<8xbf16>
+ amdgpu.wmma 16x16x16 %arg4 * %arg4 + %arg4 : vector<8xbf16>, vector<8xbf16>, vector<8xbf16>
// CHECK: rocdl.wmma.bf16.16x16x16.bf16{{.*}}: (vector<4xi16>, vector<4xi16>, vector<4xi16>, i1) -> vector<4xi16>
- amdgpu.wmma %arg5 * %arg5 + %arg5 : vector<4xbf16>, vector<4xbf16>, vector<4xbf16>
+ amdgpu.wmma 16x16x16 %arg5 * %arg5 + %arg5 : vector<4xbf16>, vector<4xbf16>, vector<4xbf16>
// CHECK: rocdl.wmma.f32.16x16x16.fp8_fp8{{.*}}: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg6 * %arg6 + %arg2 : vector<8xf8E4M3FN>, vector<8xf8E4M3FN>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg6 * %arg6 + %arg2 : vector<8xf8E4M3FN>, vector<8xf8E4M3FN>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.fp8_fp8{{.*}}: (i32, i32, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg7 * %arg7 + %arg3 : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg7 * %arg7 + %arg3 : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, vector<4xf32>
// CHECK: rocdl.wmma.f32.16x16x16.fp8_bf8{{.*}}: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg6 * %arg8 + %arg2 : vector<8xf8E4M3FN>, vector<8xf8E5M2>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg6 * %arg8 + %arg2 : vector<8xf8E4M3FN>, vector<8xf8E5M2>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.fp8_bf8{{.*}}: (i32, i32, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg7 * %arg9 + %arg3 : vector<4xf8E4M3FN>, vector<4xf8E5M2>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg7 * %arg9 + %arg3 : vector<4xf8E4M3FN>, vector<4xf8E5M2>, vector<4xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf8_bf8{{.*}}: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg8 * %arg8 + %arg2 : vector<8xf8E5M2>, vector<8xf8E5M2>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg8 * %arg8 + %arg2 : vector<8xf8E5M2>, vector<8xf8E5M2>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf8_bf8{{.*}}: (i32, i32, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg9 * %arg9 + %arg3 : vector<4xf8E5M2>, vector<4xf8E5M2>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg9 * %arg9 + %arg3 : vector<4xf8E5M2>, vector<4xf8E5M2>, vector<4xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf8_fp8{{.*}}: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
- amdgpu.wmma %arg8 * %arg6 + %arg2 : vector<8xf8E5M2>, vector<8xf8E4M3FN>, vector<8xf32>
+ amdgpu.wmma 16x16x16 %arg8 * %arg6 + %arg2 : vector<8xf8E5M2>, vector<8xf8E4M3FN>, vector<8xf32>
// CHECK: rocdl.wmma.f32.16x16x16.bf8_fp8{{.*}}: (i32, i32, vector<4xf32>) -> vector<4xf32>
- amdgpu.wmma %arg9 * %arg7 + %arg3 : vector<4xf8E5M2>, vector<4xf8E4M3FN>, vector<4xf32>
+ amdgpu.wmma 16x16x16 %arg9 * %arg7 + %arg3 : vector<4xf8E5M2>, vector<4xf8E4M3FN>, vector<4xf32>
// CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
- amdgpu.wmma %arg10 * %arg10 + %arg12 {clamp} : vector<8xi8>, vector<8xi8>, vector<8xi32>
+ amdgpu.wmma 16x16x16 %arg10 * %arg10 + %arg12 {clamp} : vector<8xi8>, vector<8xi8>, vector<8xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32>
- amdgpu.wmma %arg11 * %arg11 + %arg13 {unsignedA, unsignedB, clamp}: vector<4xi8>, vector<4xi8>, vector<4xi32>
+ amdgpu.wmma 16x16x16 %arg11 * %arg11 + %arg13 {unsignedA, unsignedB, clamp}: vector<4xi8>, vector<4xi8>, vector<4xi32>
// CHECK: rocdl.wmma.i32.16x16x32.iu4{{.*}}: (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
- amdgpu.wmma %arg14 * %arg14 + %arg12 {clamp} : vector<16xi4>, vector<16xi4>, vector<8xi32>
+ amdgpu.wmma 16x16x32 %arg14 * %arg14 + %arg12 {clamp} : vector<16xi4>, vector<16xi4>, vector<8xi32>
// CHECK: rocdl.wmma.i32.16x16x32.iu4{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32>
- amdgpu.wmma %arg15 * %arg15 + %arg13 {clamp} : vector<8xi4>, vector<8xi4>, vector<4xi32>
+ amdgpu.wmma 16x16x32 %arg15 * %arg15 + %arg13 {clamp} : vector<8xi4>, vector<8xi4>, vector<4xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, i32, i1, i32, vector<8xi32>, i1) -> vector<8xi32>
- amdgpu.wmma %arg15 * %arg15 + %arg12 {clamp} : vector<8xi4>, vector<8xi4>, vector<8xi32>
+ amdgpu.wmma 16x16x16 %arg15 * %arg15 + %arg12 {clamp} : vector<8xi4>, vector<8xi4>, vector<8xi32>
// CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32>
- amdgpu.wmma %arg16 * %arg16 + %arg13 {clamp} : vector<4xi4>, vector<4xi4>, vector<4xi32>
+ amdgpu.wmma 16x16x16 %arg16 * %arg16 + %arg13 {clamp} : vector<4xi4>, vector<4xi4>, vector<4xi32>
func.return
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index a8256b1..6a2518a 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -120,9 +120,49 @@ func.func @no_negation(%a: f32, %b: f32, %c: vector<32xf32>) -> vector<32xf32> {
// -----
-func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xi32>) -> vector<8xi32> {
- // expected-error@+1 {{'amdgpu.wmma' op Expected int sources with int destination}}
- %0 = amdgpu.wmma %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xi32>
+func.func @wmma_f16_i32(%arg0 : vector<16xf16>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // expected-error@+1 {{'amdgpu.wmma' op expected int sources with int destination}}
+ %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xi32>
+ func.return %0 : vector<8xi32>
+}
+
+// -----
+
+func.func @wmma_i8_f32(%arg0 : vector<16xi8>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+ // expected-error@+1 {{'amdgpu.wmma' op expected float sources with float destination}}
+ %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xf32>
+ func.return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @wmma_no_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // expected-error@+1 {{'amdgpu.wmma' expected 3 dimensions in MNK dimension list}}
+ %0 = amdgpu.wmma 16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
+ func.return %0 : vector<8xi32>
+}
+
+// -----
+
+func.func @wmma_wrong_m_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // expected-error@+1 {{'amdgpu.wmma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}}
+ %0 = amdgpu.wmma 32x16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
+ func.return %0 : vector<8xi32>
+}
+
+// -----
+
+func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // expected-error@+1 {{'amdgpu.wmma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}}
+ %0 = amdgpu.wmma 16x32x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
+ func.return %0 : vector<8xi32>
+}
+
+// -----
+
+func.func @wmma_wrong_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+ %0 = amdgpu.wmma 16x16x24 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
func.return %0 : vector<8xi32>
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index f9c6899..a185eb6 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -565,13 +565,20 @@ func.func @mfma(%arg0 : f32, %arg1 : vector<32xf32>) -> vector<32xf32> {
func.return %0 : vector<32xf32>
}
-// CHECK-LABEL: func @wmma
-func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16> {
- // CHECK: amdgpu.wmma
- %0 = amdgpu.wmma %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xf16>
+// CHECK-LABEL: func @wmma_f16_16x16x16_f16
+func.func @wmma_f16_16x16x16_f16(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16> {
+ // CHECK: amdgpu.wmma 16x16x16
+ %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xf16>
func.return %0 : vector<8xf16>
}
+// CHECK-LABEL: func @wmma_i32_16x16x32_i4
+func.func @wmma_i32_16x16x32_i4(%arg0 : vector<16xi4>, %arg1 : vector<8xi32>) -> vector<8xi32> {
+ // CHECK: amdgpu.wmma 16x16x32
+ %0 = amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg1 : vector<16xi4>, vector<16xi4>, vector<8xi32>
+ func.return %0 : vector<8xi32>
+}
+
// CHECK-LABEL: func @swizzle_bitmode
func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
// CHECK: amdgpu.swizzle_bitmode
diff --git a/mlir/test/Dialect/Tosa/availability.mlir b/mlir/test/Dialect/Tosa/availability.mlir
index c138ac9..a05f423 100644
--- a/mlir/test/Dialect/Tosa/availability.mlir
+++ b/mlir/test/Dialect/Tosa/availability.mlir
@@ -696,3 +696,21 @@ func.func @test_const_shape() -> !tosa.shape<4> {
return %cst : !tosa.shape<4>
}
+// -----
+// CHECK-LABEL: test_cast_from_block_scaled
+func.func @test_cast_from_block_scaled(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ // CHECK: profiles: [ [pro_fp] ]
+ // CHECK: extensions: [ [bf16, mxfp] ]
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+// CHECK-LABEL: test_cast_to_block_scaled
+func.func @test_cast_to_block_scaled(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) {
+ // CHECK: profiles: [ [pro_fp] ]
+ // CHECK: extensions: [ [bf16, mxfp] ]
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = BLOCK_SIZE_32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>
+}
+
diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir
index ab048af..68a9578 100644
--- a/mlir/test/Dialect/Tosa/invalid_extension.mlir
+++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir
@@ -549,7 +549,6 @@ func.func @test_matmul_t_block_scaled(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: ten
// -----
-// CHECK-LABEL: test_argmax_int64
func.func @test_argmax_int64(%arg0: tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64> {
// expected-error@+1 {{'tosa.argmax' op illegal: requires [int64] but not enabled in target}}
%0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64>
@@ -569,3 +568,19 @@ func.func @test_cast_f4e2m1(%arg0: tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xb
%0 = tosa.cast %arg0 : (tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16>
return %0 : tensor<13x21x3xbf16>
}
+
+// -----
+
+func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf8E5M2>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op illegal: requires [mxfp] but not enabled in target}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf8E5M2>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op illegal: requires [mxfp] but not enabled in target}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir
index 8771e6e..a708764 100644
--- a/mlir/test/Dialect/Tosa/level_check.mlir
+++ b/mlir/test/Dialect/Tosa/level_check.mlir
@@ -1625,9 +1625,40 @@ func.func @test_unranked_weight_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor
// -----
-// CHECK-LABEL: test_matmul_t_block_scaled_invalid_size
func.func @test_matmul_t_block_scaled_invalid_size(%arg0: tensor<4x8x536870912xf4E2M1FN>, %arg1: tensor<4x8x16777216xf8E8M0FNU>, %arg2: tensor<4x16x536870912xf4E2M1FN>, %arg3: tensor<4x16x16777216xf8E8M0FNU>) -> tensor<*xf32> {
// expected-error@+1 {{'tosa.matmul_t_block_scaled' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}}
%0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x536870912xf4E2M1FN>, tensor<4x8x16777216xf8E8M0FNU>, tensor<4x16x536870912xf4E2M1FN>, tensor<4x16x16777216xf8E8M0FNU>) -> tensor<*xf32>
return %0 : tensor<*xf32>
}
+
+// -----
+
+func.func @test_cast_from_block_scaled_invalid_size(%arg0: tensor<67108864x32xf6E2M3FN>, %arg1: tensor<67108864x1xf8E8M0FNU>) -> tensor<67108864x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<67108864x32xf6E2M3FN>, tensor<67108864x1xf8E8M0FNU>) -> tensor<67108864x32xf32>
+ return %0 : tensor<67108864x32xf32>
+}
+
+// -----
+
+func.func @test_cast_from_block_scaled_invalid_rank(%arg0: tensor<1x2x3x4x5x6x7x32xf6E2M3FN>, %arg1: tensor<1x2x3x4x5x6x7x1xf8E8M0FNU>) -> tensor<1x2x3x4x5x6x7x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op failed level check: operand rank(shape) <= MAX_RANK}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<1x2x3x4x5x6x7x32xf6E2M3FN>, tensor<1x2x3x4x5x6x7x1xf8E8M0FNU>) -> tensor<1x2x3x4x5x6x7x32xf32>
+ return %0 : tensor<1x2x3x4x5x6x7x32xf32>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_invalid_size(%arg0: tensor<67108864x32xf32>) -> (tensor<67108864x32xf6E2M3FN>, tensor<67108864x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<67108864x32xf32>) -> (tensor<67108864x32xf6E2M3FN>, tensor<67108864x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<67108864x32xf6E2M3FN>, tensor<67108864x1xf8E8M0FNU>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_invalid_rank(%arg0: tensor<1x2x3x4x5x6x7x32xf32>) -> (tensor<1x2x3x4x5x6x7x32xf6E2M3FN>, tensor<1x2x3x4x5x6x7x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op failed level check: operand rank(shape) <= MAX_RANK}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<1x2x3x4x5x6x7x32xf32>) -> (tensor<1x2x3x4x5x6x7x32xf6E2M3FN>, tensor<1x2x3x4x5x6x7x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<1x2x3x4x5x6x7x32xf6E2M3FN>, tensor<1x2x3x4x5x6x7x1xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 9bf36b5..865f712 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -1268,3 +1268,31 @@ func.func @test_matmul_t_block_scaled_broadcast(%arg0: tensor<?x8x32xf8E4M3FN>,
%0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<?x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<?x16x32xf8E4M3FN>, tensor<1x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32>
return %0 : tensor<4x8x16xf32>
}
+
+// -----
+// CHECK-LABEL: test_cast_from_block_scaled_static
+func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+// CHECK-LABEL: test_cast_from_block_scaled_unranked
+func.func @test_cast_from_block_scaled_unranked(%arg0: tensor<*xf4E2M1FN>, %arg1: tensor<*xf8E8M0FNU>) -> tensor<*xf32> {
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) -> tensor<*xf32>
+ return %0 : tensor<*xf32>
+}
+
+// -----
+// CHECK-LABEL: test_cast_to_block_scaled_static
+func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) {
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>
+}
+
+// -----
+// CHECK-LABEL: test_cast_to_block_scaled_unranked
+func.func @test_cast_to_block_scaled_unranked(%arg0: tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) {
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
index 0271d71..7de7b85 100644
--- a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
+++ b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
@@ -332,3 +332,17 @@ func.func @test_matmul_t_block_scaled(%arg0: tensor<4x8x32xf6E3M2FN>, %arg1: ten
%0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf6E3M2FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf6E3M2FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32>
return %0 : tensor<4x8x16xf32>
}
+
+// -----
+func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op illegal: requires [pro_fp] but not enabled in target}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op illegal: requires [pro_fp] but not enabled in target}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index 72479fe..54556a0 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -1628,3 +1628,48 @@ func.func @test_matmul_t_block_scaled_broadcast_b_scale(%arg0: tensor<*xf8E4M3FN
%0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<1x4x1xf8E8M0FNU>) -> tensor<?x?x?xf32>
return %0 : tensor<?x?x?xf32>
}
+
+// -----
+
+// CHECK-LABEL: test_cast_from_block_scaled_static
+func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<*xf32> {
+ // CHECK: -> tensor<4x32xf32>
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<*xf32>
+ return %0 : tensor<*xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_from_block_scaled_unranked_input_scale
+func.func @test_cast_from_block_scaled_unranked_input_scale(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<*xf8E8M0FNU>) -> tensor<*xf32> {
+ // CHECK: -> tensor<4x32xf32>
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<*xf8E8M0FNU>) -> tensor<*xf32>
+ return %0 : tensor<*xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_static
+func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) {
+ // CHECK: -> (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_unranked
+func.func @test_cast_to_block_scaled_unranked(%arg0: tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) {
+ // CHECK: -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_dynamic_scales
+func.func @test_cast_to_block_scaled_dynamic_scales(%arg0: tensor<4x?xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) {
+ // CHECK: -> (tensor<4x?xf4E2M1FN>, tensor<4x?xf8E8M0FNU>)
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x?xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
index 926e7f2..f3d8dab 100644
--- a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
@@ -58,3 +58,27 @@ func.func @test_cast_f4e2m1(%arg0: tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xb
%0 = tosa.cast %arg0 : (tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16>
return %0 : tensor<13x21x3xbf16>
}
+
+// -----
+
+// CHECK-LABEL: test_cast_from_block_scaled_fp8e5m2_fp32
+func.func @test_cast_from_block_scaled_fp8e5m2_fp32(%arg0: tensor<4x32xf8E5M2>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf8E5M2>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_from_block_scaled_fp8e5m2_bf16
+func.func @test_cast_from_block_scaled_fp8e5m2_bf16(%arg0: tensor<4x32xf8E5M2>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xbf16> {
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf8E5M2>, tensor<4x1xf8E8M0FNU>) -> tensor<4x32xbf16>
+ return %0 : tensor<4x32xbf16>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_static
+func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>) {
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir
index 4be5d72..6cf76cd 100644
--- a/mlir/test/Dialect/Tosa/verifier.mlir
+++ b/mlir/test/Dialect/Tosa/verifier.mlir
@@ -1033,7 +1033,6 @@ module {
// -----
-// CHECK-LABEL: @scatter_invalid_indices_N
func.func @scatter_invalid_indices_N(%arg0 : tensor<2x4x5xi32>, %arg1 : tensor<3x2xi32>, %arg2 : tensor<2x2x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires indices dimension 0 to have size 2, got 3}}
%1 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<2x4x5xi32>, tensor<3x2xi32>, tensor<2x2x5xi32>) -> tensor<2x4x5xi32>
@@ -1042,7 +1041,6 @@ func.func @scatter_invalid_indices_N(%arg0 : tensor<2x4x5xi32>, %arg1 : tensor<3
// -----
-// CHECK-LABEL: @scatter_invalid_input_N
func.func @scatter_invalid_input_N(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<2x2xi32>, %arg2 : tensor<3x2x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires input dimension 0 to have size 2, got 3}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<2x2xi32>, tensor<3x2x5xi32>) -> tensor<2x4x5xi32>
@@ -1051,7 +1049,6 @@ func.func @scatter_invalid_input_N(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<2x2
// -----
-// CHECK-LABEL: @scatter_invalid_out_N
func.func @scatter_invalid_out_N(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi32>, %arg2 : tensor<2x2x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires values_out dimension 0 to have size 2, got 3}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<?x2xi32>, tensor<2x2x5xi32>) -> tensor<3x4x5xi32>
@@ -1060,7 +1057,6 @@ func.func @scatter_invalid_out_N(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi
// -----
-// CHECK-LABEL: @scatter_invalid_out_K
func.func @scatter_invalid_out_K(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi32>, %arg2 : tensor<2x2x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires values_out dimension 1 to have size 4, got 3}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<?x2xi32>, tensor<2x2x5xi32>) -> tensor<2x3x5xi32>
@@ -1069,7 +1065,6 @@ func.func @scatter_invalid_out_K(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi
// -----
-// CHECK-LABEL: @scatter_invalid_input_W
func.func @scatter_invalid_input_W(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi32>, %arg2 : tensor<2x3x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires input dimension 1 to have size 2, got 3}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<?x2xi32>, tensor<2x3x5xi32>) -> tensor<2x4x5xi32>
@@ -1078,7 +1073,6 @@ func.func @scatter_invalid_input_W(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2
// -----
-// CHECK-LABEL: @scatter_invalid_input_C
func.func @scatter_invalid_input_C(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi32>, %arg2 : tensor<2x2x6xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires input dimension 2 to have size 5, got 6}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<?x2xi32>, tensor<2x2x6xi32>) -> tensor<2x4x5xi32>
@@ -1087,7 +1081,6 @@ func.func @scatter_invalid_input_C(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2
// -----
-// CHECK-LABEL: @scatter_invalid_out_C
func.func @scatter_invalid_out_C(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi32>, %arg2 : tensor<2x2x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires values_out dimension 2 to have size 5, got 6}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<?x4x5xi32>, tensor<?x2xi32>, tensor<2x2x5xi32>) -> tensor<2x4x6xi32>
@@ -1096,7 +1089,6 @@ func.func @scatter_invalid_out_C(%arg0 : tensor<?x4x5xi32>, %arg1 : tensor<?x2xi
// -----
-// CHECK-LABEL: @scatter_invalid_K_W
func.func @scatter_invalid_K_W(%arg0 : tensor<2x4x5xi32>, %arg1 : tensor<2x6xi32>, %arg2 : tensor<2x6x5xi32>) {
// expected-error@+1 {{'tosa.scatter' op requires dimensions K >= W, got K=4 and W=6}}
%2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<2x4x5xi32>, tensor<2x6xi32>, tensor<2x6x5xi32>) -> tensor<2x4x5xi32>
@@ -1150,3 +1142,83 @@ func.func @test_matmul_t_block_scaled_batch_mismatch(%arg0: tensor<4x8x32xf8E4M3
%0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<2x16x32xf8E4M3FN>, tensor<2x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32>
return %0 : tensor<4x8x16xf32>
}
+
+// -----
+
+func.func @cast_from_block_scaled_incompatible_input_output_shape(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<5x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op require compatible shapes for input_data ('tensor<4x32xf4E2M1FN>') and output_data ('tensor<5x32xf32>')}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<5x32xf32>
+ return %0 : tensor<5x32xf32>
+}
+
+// -----
+
+func.func @cast_from_block_scaled_not_scalar(%arg0: tensor<f4E2M1FN>, %arg1: tensor<f8E8M0FNU>) -> tensor<f32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op operand #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor<f4E2M1FN>'}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<f4E2M1FN>, tensor<f8E8M0FNU>) -> tensor<f32>
+ return %0 : tensor<f32>
+}
+
+// -----
+
+func.func @cast_from_block_scaled_not_divisible_by_block_size(%arg0: tensor<4x33xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x33xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op expect last dimension of input_data (33) to be divisible by block_size (32)}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x33xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) -> tensor<4x33xf32>
+ return %0 : tensor<4x33xf32>
+}
+
+// -----
+
+func.func @cast_from_block_scaled_data_scale_mismatch(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<5x1xf8E8M0FNU>) -> tensor<4x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op require compatible shapes for input_data ('tensor<4x32xf4E2M1FN>') and input_scale ('tensor<5x1xf8E8M0FNU>') except for the last dimension}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x32xf4E2M1FN>, tensor<5x1xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+
+func.func @cast_from_block_scaled_data_scale_channel_mismatch(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x2xf8E8M0FNU>) -> tensor<4x32xf32> {
+ // expected-error@+1 {{'tosa.cast_from_block_scaled' op expect last dimension of input_scale (2) to be equal to last dimension of input_data / block_size (1)}}
+ %0 = tosa.cast_from_block_scaled %arg0, %arg1 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf4E2M1FN>, tensor<4x2xf8E8M0FNU>) -> tensor<4x32xf32>
+ return %0 : tensor<4x32xf32>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_incompatible_input_output_shape(%arg0: tensor<4x32xf32>) -> (tensor<5x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op require compatible shapes for input_data ('tensor<4x32xf32>') and output_data ('tensor<5x32xf4E2M1FN>')}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<5x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<5x32xf4E2M1FN>, tensor<4x1xf8E8M0FNU>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_not_scalar(%arg0: tensor<f32>) -> (tensor<f4E2M1FN>, tensor<f8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op operand #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor<f32>'}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<f32>) -> (tensor<f4E2M1FN>, tensor<f8E8M0FNU>)
+ return %0#0, %0#1 : tensor<f4E2M1FN>, tensor<f8E8M0FNU>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_not_divisible_by_block_size(%arg0: tensor<4x33xf32>) -> (tensor<4x33xf4E2M1FN>, tensor<4x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op expect last dimension of input_data (33) to be divisible by block_size (32)}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x33xf32>) -> (tensor<4x33xf4E2M1FN>, tensor<4x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x33xf4E2M1FN>, tensor<4x1xf8E8M0FNU>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_data_scale_mismatch(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<5x1xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op require compatible shapes for output_data ('tensor<4x32xf4E2M1FN>') and output_scale ('tensor<5x1xf8E8M0FNU>') except for the last dimension}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<5x1xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf4E2M1FN>, tensor<5x1xf8E8M0FNU>
+}
+
+// -----
+
+func.func @test_cast_to_block_scaled_data_scale_channel_mismatch(%arg0: tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x2xf8E8M0FNU>) {
+ // expected-error@+1 {{'tosa.cast_to_block_scaled' op expect last dimension of output_scale (2) to be equal to last dimension of output_data / block_size (1)}}
+ %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf4E2M1FN>, tensor<4x2xf8E8M0FNU>)
+ return %0#0, %0#1 : tensor<4x32xf4E2M1FN>, tensor<4x2xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index fe4f44c..7e742af 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -682,3 +682,73 @@ gpu.module @test_kernel {
gpu.return
}
}
+
+// -----
+gpu.module @test_kernel {
+ // CHECK-LABEL: remove_unit_dim_inst_data
+ // CHECK-SAME: [[arg0:%.+]]: ui64
+ // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
+ // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
+ // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+ // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+ // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+ // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+ // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+ // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+ gpu.func @remove_unit_dim_inst_data(%src: ui64) -> vector<1x1x32xf32> {
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+ [0, 8, 16, 24, 32, 40, 48, 56,
+ 64, 72, 80, 88, 96, 104, 112, 120,
+ 128, 136, 144, 152, 160, 168, 176, 184,
+ 192, 200, 208, 216, 224, 232, 240, 248]
+ ]]> : vector<1x1x32xindex>
+
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+
+ gpu.return %ld : vector<1x1x32xf32>
+ }
+}
+
+// -----
+#l = #xegpu.layout<inst_data = [1, 16]>
+gpu.module @test_kernel {
+ // CHECK-LABEL: load_store_nd_with_offsets
+ // CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32>
+ // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
+ // CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index
+ // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
+ // CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+ // CHECK: [[tdesc_b:%.+]] = xegpu.create_nd_tdesc [[arg1]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+ // CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+ // CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+ // CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+ // CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+ // CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+ // CHECK: [[cast_a0:%.+]] = vector.shape_cast [[ld_a0]] : vector<1x16xf32> to vector<16xf32>
+ // CHECK: [[cast_b0:%.+]] = vector.shape_cast [[ld_b0]] : vector<1x16xf32> to vector<16xf32>
+ // CHECK: [[add0:%.+]] = arith.addf [[cast_a0]], [[cast_b0]] : vector<16xf32>
+ // CHECK: [[ins0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0, 0], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+ // CHECK: [[cast_a1:%.+]] = vector.shape_cast [[ld_a1]] : vector<1x16xf32> to vector<16xf32>
+ // CHECK: [[cast_b1:%.+]] = vector.shape_cast [[ld_b1]] : vector<1x16xf32> to vector<16xf32>
+ // CHECK: [[add1:%.+]] = arith.addf [[cast_a1]], [[cast_b1]] : vector<16xf32>
+ // CHECK: [[ins1:%.+]] = vector.insert_strided_slice [[add1]], [[ins0]] {offsets = [0, 16], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+ // CHECK: [[ext0:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+ // CHECK: [[ext1:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+ // CHECK: xegpu.store_nd [[ext0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+ // CHECK: xegpu.store_nd [[ext1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+ gpu.func @load_store_nd_with_offsets(%A: memref<1024x1024xf32>, %B: memref<1024x1024xf32>, %C: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+
+ %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+ %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+
+ %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
+ xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
+ gpu.return
+ }
+}
diff --git a/offload/include/OffloadPolicy.h b/offload/include/OffloadPolicy.h
index 800fefb..9bbd1ff 100644
--- a/offload/include/OffloadPolicy.h
+++ b/offload/include/OffloadPolicy.h
@@ -22,8 +22,6 @@ enum kmp_target_offload_kind_t {
tgt_mandatory = 2
};
-extern "C" int __kmpc_get_target_offload(void) __attribute__((weak));
-
class OffloadPolicy {
OffloadPolicy(PluginManager &PM) {
diff --git a/offload/include/OpenMP/InternalTypes.h b/offload/include/OpenMP/InternalTypes.h
index bd84c38..5460c2a 100644
--- a/offload/include/OpenMP/InternalTypes.h
+++ b/offload/include/OpenMP/InternalTypes.h
@@ -70,10 +70,23 @@ typedef struct kmp_task {
int32_t part_id;
} kmp_task_t;
-int32_t __kmpc_global_thread_num(void *) __attribute__((weak));
-bool __kmpc_omp_has_task_team(int32_t gtid) __attribute__((weak));
-void **__kmpc_omp_get_target_async_handle_ptr(int32_t gtid)
- __attribute__((weak));
+// Implemented in libomp; they are called from within __tgt_* functions.
+int32_t __kmpc_global_thread_num(void *);
+bool __kmpc_omp_has_task_team(int32_t gtid);
+void **__kmpc_omp_get_target_async_handle_ptr(int32_t gtid);
+int __kmpc_get_target_offload(void);
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry, int64_t device_id);
+int32_t __kmpc_omp_task_with_deps(ident_t *loc_ref, int32_t gtid,
+ kmp_task_t *new_task, int32_t ndeps,
+ kmp_depend_info_t *dep_list,
+ int32_t ndeps_noalias,
+ kmp_depend_info_t *noalias_dep_list);
+void __kmpc_omp_wait_deps(ident_t *loc_ref, int32_t gtid, int32_t ndeps,
+ kmp_depend_info_t *dep_list, int32_t ndeps_noalias,
+ kmp_depend_info_t *noalias_dep_list);
/**
* The argument set that is passed from asynchronous memory copy to block
diff --git a/offload/include/OpenMP/omp.h b/offload/include/OpenMP/omp.h
index 49d9f1f..768ca46 100644
--- a/offload/include/OpenMP/omp.h
+++ b/offload/include/OpenMP/omp.h
@@ -40,7 +40,7 @@ typedef void *omp_depend_t;
/// API declarations
///{
-int omp_get_default_device(void) __attribute__((weak));
+int omp_get_default_device(void);
///}
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 48b086d..dd83a3c 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -48,29 +48,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name);
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name);
-// Implemented in libomp, they are called from within __tgt_* functions.
-extern "C" {
-int __kmpc_get_target_offload(void) __attribute__((weak));
-kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
- size_t sizeof_kmp_task_t,
- size_t sizeof_shareds,
- kmp_routine_entry_t task_entry)
- __attribute__((weak));
-
-kmp_task_t *
-__kmpc_omp_target_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
- size_t sizeof_kmp_task_t, size_t sizeof_shareds,
- kmp_routine_entry_t task_entry, int64_t device_id)
- __attribute__((weak));
-
-int32_t __kmpc_omp_task_with_deps(ident_t *loc_ref, int32_t gtid,
- kmp_task_t *new_task, int32_t ndeps,
- kmp_depend_info_t *dep_list,
- int32_t ndeps_noalias,
- kmp_depend_info_t *noalias_dep_list)
- __attribute__((weak));
-}
-
EXTERN int omp_get_num_devices(void) {
TIMESCOPE();
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
diff --git a/offload/libomptarget/OpenMP/InteropAPI.cpp b/offload/libomptarget/OpenMP/InteropAPI.cpp
index d6ef17c..b307011 100644
--- a/offload/libomptarget/OpenMP/InteropAPI.cpp
+++ b/offload/libomptarget/OpenMP/InteropAPI.cpp
@@ -18,14 +18,6 @@
#include <cstdlib>
#include <cstring>
-extern "C" {
-
-void __kmpc_omp_wait_deps(ident_t *loc_ref, int32_t gtid, int32_t ndeps,
- kmp_depend_info_t *dep_list, int32_t ndeps_noalias,
- kmp_depend_info_t *noalias_dep_list);
-
-} // extern "C"
-
namespace {
omp_interop_rc_t getPropertyErrorType(omp_interop_property_t Property) {
switch (Property) {
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index b892ac9..7d62afc 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -979,7 +979,7 @@ cc_binary(
)
gentbl_cc_library(
- name = "lldb_server_llgs_opts_gen",
+ name = "lldb_server_opts_gen",
strip_include_prefix = ".",
tbl_outs = {"LLGSOptions.inc": ["-gen-opt-parser-defs"]},
tblgen = "//llvm:llvm-tblgen",
@@ -987,15 +987,6 @@ gentbl_cc_library(
deps = ["//llvm:OptParserTdFiles"],
)
-gentbl_cc_library(
- name = "lldb_server_platform_opts_gen",
- strip_include_prefix = ".",
- tbl_outs = {"PlatformOptions.inc": ["-gen-opt-parser-defs"]},
- tblgen = "//llvm:llvm-tblgen",
- td_file = "tools/lldb-server/PlatformOptions.td",
- deps = ["//llvm:OptParserTdFiles"],
-)
-
cc_binary(
name = "lldb-server",
srcs = glob([
@@ -1014,8 +1005,7 @@ cc_binary(
":Interpreter",
":Utility",
":Version",
- ":lldb_server_llgs_opts_gen",
- ":lldb_server_platform_opts_gen",
+ ":lldb_server_opts_gen",
"//lldb:Target",
"//lldb:TargetHeaders",
"//lldb/source/Plugins:PluginCPlusPlusLanguage",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 63d4aea..bcee6c4 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -577,6 +577,7 @@ cc_library(
deps = [
":BinaryFormat",
":DebugInfoCodeView",
+ ":DebugInfoDWARFLowLevel",
":Support",
":TargetParser",
":config",