[𝘀𝗽𝗿] changes introduced through rebaseusers/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes

Created using spr 1.3.4 [skip ci]
author: Amir Ayupov <aaupov@fb.com> 2024-11-08 15:25:36 -0800
committer: Amir Ayupov <aaupov@fb.com> 2024-11-08 15:25:36 -0800
commit: 2502434bf541db29b813c355f1b679bd07f0def5 (patch)
tree: 49c4a44014dd132e062e4ee5836857d70a0690be
parent: c84de4215823f46498a049cca8adcbb6ca6e0f83 (diff)
parent: d936924f5e22e8efbc27873f62e8dfc6e410fcf9 (diff)
download: llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.zip
llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.tar.gz
llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.tar.bz2
226 files changed, 4995 insertions, 3673 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 018b643..24f8704 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -141,7 +141,7 @@
 /clang/tools/clang-installapi/ @cyndyishida
 
 # ExtractAPI
-/clang/**/ExtractAPI @daniel-grumberg
+/clang/**/ExtractAPI @daniel-grumberg @QuietMisdreavus
 
 # DWARFLinker, dwarfutil, dsymutil
 /llvm/**/DWARFLinker/ @JDevlieghere
diff --git a/clang/docs/AMDGPUSupport.rst b/clang/docs/AMDGPUSupport.rst
index 3eada5f..e63c0e1 100644
--- a/clang/docs/AMDGPUSupport.rst
+++ b/clang/docs/AMDGPUSupport.rst
@@ -50,9 +50,9 @@ Predefined Macros
    * - ``__AMDGCN_UNSAFE_FP_ATOMICS__``
      - Defined if unsafe floating-point atomics are allowed.
    * - ``__AMDGCN_WAVEFRONT_SIZE__``
-     - Defines the wavefront size. Allowed values are 32 and 64 (deprecated).
+     - Defines the wavefront size. Allowed values are 32 and 64.
    * - ``__AMDGCN_WAVEFRONT_SIZE``
-     - Alias to ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated).
+     - Alias to ``__AMDGCN_WAVEFRONT_SIZE__``. To be deprecated.
    * - ``__HAS_FMAF__``
      - Defined if FMAF instruction is available (deprecated).
    * - ``__HAS_LDEXPF__``
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index e830acd..e26297c 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -178,7 +178,7 @@ Predefined Macros
 
 Note that some architecture specific AMDGPU macros will have default values when
 used from the HIP host compilation. Other :doc:`AMDGPU macros <AMDGPUSupport>`
-like ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated) will default to 64 for example.
+like ``__AMDGCN_WAVEFRONT_SIZE__`` will default to 64 for example.
 
 Compilation Modes
 =================
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index f728535..0998e6b 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5974,3 +5974,31 @@ Clang guarantees the following behaviors:
   padding bits are initialized to zero.
 
 Currently, the above extension only applies to C source code, not C++.
+
+
+Empty Objects in C
+==================
+The declaration of a structure or union type which has no named members is
+undefined behavior (C23 and earlier) or implementation-defined behavior (C2y).
+Clang allows the declaration of a structure or union type with no named members
+in all C language modes. `sizeof` for such a type returns `0`, which is
+different behavior than in C++ (where the size of such an object is typically
+`1`).
+
+
+Qualified function types in C
+=============================
+Declaring a function with a qualified type in C is undefined behavior (C23 and
+earlier) or implementation-defined behavior (C2y). Clang allows a function type
+to be specified with the ``const`` and ``volatile`` qualifiers, but ignores the
+qualifications.
+
+.. code-block:: c
+
+   typedef int f(void);
+   const volatile f func; // Qualifier on function type has no effect.
+
+
+Note, Clang does not allow an ``_Atomic`` function type because
+of explicit constraints against atomically qualified (arrays and) function
+types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d0c43ff..c3424e0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -298,6 +298,22 @@ C2y Feature Support
   paper adopts Clang's existing practice, so there were no changes to compiler
   behavior.
 
+- Implemented support for `N3341 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3341.pdf>`_
+  which makes empty structure and union objects implementation-defined in C.
+  ``-Wgnu-empty-struct`` will be emitted in C23 and earlier modes because the
+  behavior is a conforming GNU extension in those modes, but will no longer
+  have an effect in C2y mode.
+
+- Updated conformance for `N3342 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3342.pdf>`_
+  which made qualified function types implementation-defined rather than
+  undefined. Clang has always accepted ``const`` and ``volatile`` qualified
+  function types by ignoring the qualifiers.
+
+- Updated conformance for `N3346 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3346.pdf>`_
+  which changes some undefined behavior around initialization to instead be
+  constraint violations. This paper adopts Clang's existing practice, so there
+  were no changes to compiler behavior.
+
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index b3aea09..3eb50f9 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -114,7 +114,6 @@ public:
   }
 };
 
-class OpenACCLoopConstruct;
 /// This class represents a compute construct, representing a 'Kind' of
 /// `parallel', 'serial', or 'kernel'. These constructs are associated with a
 /// 'structured block', defined as:
@@ -183,8 +182,7 @@ public:
   static OpenACCComputeConstruct *
   Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
          SourceLocation DirectiveLoc, SourceLocation EndLoc,
-         ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock,
-         ArrayRef<OpenACCLoopConstruct *> AssociatedLoopConstructs);
+         ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock);
 
   Stmt *getStructuredBlock() { return getAssociatedStmt(); }
   const Stmt *getStructuredBlock() const {
@@ -198,12 +196,10 @@ class OpenACCLoopConstruct final
     : public OpenACCAssociatedStmtConstruct,
       public llvm::TrailingObjects<OpenACCLoopConstruct,
                                    const OpenACCClause *> {
-  // The compute construct this loop is associated with, or nullptr if this is
-  // an orphaned loop construct, or if it hasn't been set yet.  Because we
-  // construct the directives at the end of their statement, the 'parent'
-  // construct is not yet available at the time of construction, so this needs
-  // to be set 'later'.
-  const OpenACCComputeConstruct *ParentComputeConstruct = nullptr;
+  // The compute/combined construct kind this loop is associated with, or
+  // invalid if this is an orphaned loop construct.
+  OpenACCDirectiveKind ParentComputeConstructKind =
+      OpenACCDirectiveKind::Invalid;
 
   friend class ASTStmtWriter;
   friend class ASTStmtReader;
@@ -212,15 +208,9 @@ class OpenACCLoopConstruct final
 
   OpenACCLoopConstruct(unsigned NumClauses);
 
-  OpenACCLoopConstruct(SourceLocation Start, SourceLocation DirLoc,
-                       SourceLocation End,
+  OpenACCLoopConstruct(OpenACCDirectiveKind ParentKind, SourceLocation Start,
+                       SourceLocation DirLoc, SourceLocation End,
                        ArrayRef<const OpenACCClause *> Clauses, Stmt *Loop);
-  void setLoop(Stmt *Loop);
-
-  void setParentComputeConstruct(OpenACCComputeConstruct *CC) {
-    assert(!ParentComputeConstruct && "Parent already set?");
-    ParentComputeConstruct = CC;
-  }
 
 public:
   static bool classof(const Stmt *T) {
@@ -231,9 +221,9 @@ public:
                                            unsigned NumClauses);
 
   static OpenACCLoopConstruct *
-  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation DirLoc,
-         SourceLocation EndLoc, ArrayRef<const OpenACCClause *> Clauses,
-         Stmt *Loop);
+  Create(const ASTContext &C, OpenACCDirectiveKind ParentKind,
+         SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation EndLoc,
+         ArrayRef<const OpenACCClause *> Clauses, Stmt *Loop);
 
   Stmt *getLoop() { return getAssociatedStmt(); }
   const Stmt *getLoop() const {
@@ -246,10 +236,11 @@ public:
   /// loop construct is the nearest compute construct that lexically contains
   /// the loop construct.
   bool isOrphanedLoopConstruct() const {
-    return ParentComputeConstruct == nullptr;
+    return ParentComputeConstructKind == OpenACCDirectiveKind::Invalid;
   }
-  const OpenACCComputeConstruct *getParentComputeConstruct() const {
-    return ParentComputeConstruct;
+
+  OpenACCDirectiveKind getParentComputeConstructKind() const {
+    return ParentComputeConstructKind;
   }
 };
 } // namespace clang
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 23c8eb2..b64dbef 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3858,8 +3858,7 @@ def LifetimeBoundDocs : Documentation {
 The ``lifetimebound`` attribute on a function parameter or implicit object
 parameter indicates that objects that are referred to by that parameter may
 also be referred to by the return value of the annotated function (or, for a
-parameter of a constructor, by the value of the constructed object). It is only
-supported in C++.
+parameter of a constructor, by the value of the constructed object).
 
 By default, a reference is considered to refer to its referenced object, a
 pointer is considered to refer to its pointee, a ``std::initializer_list<T>``
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 2a6ccac..a5d97d7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6241,8 +6241,9 @@ def err_typecheck_negative_array_size : Error<"array size is negative">;
 def warn_typecheck_function_qualifiers_ignored : Warning<
   "'%0' qualifier on function type %1 has no effect">,
   InGroup<IgnoredQualifiers>;
-def warn_typecheck_function_qualifiers_unspecified : Warning<
-  "'%0' qualifier on function type %1 has unspecified behavior">;
+def ext_typecheck_function_qualifiers_unspecified : ExtWarn<
+  "'%0' qualifier on function type %1 has no effect and is a Clang extension">,
+  InGroup<IgnoredQualifiers>;
 def warn_typecheck_reference_qualifiers : Warning<
   "'%0' qualifier on reference type %1 has no effect">,
   InGroup<IgnoredReferenceQualifiers>;
diff --git a/clang/include/clang/Basic/MacroBuilder.h b/clang/include/clang/Basic/MacroBuilder.h
index d83f27c..96e67cb 100644
--- a/clang/include/clang/Basic/MacroBuilder.h
+++ b/clang/include/clang/Basic/MacroBuilder.h
@@ -26,14 +26,8 @@ public:
   MacroBuilder(raw_ostream &Output) : Out(Output) {}
 
   /// Append a \#define line for macro of the form "\#define Name Value\n".
-  /// If DeprecationMsg is provided, also append a pragma to deprecate the
-  /// defined macro.
-  void defineMacro(const Twine &Name, const Twine &Value = "1",
-                   Twine DeprecationMsg = "") {
+  void defineMacro(const Twine &Name, const Twine &Value = "1") {
     Out << "#define " << Name << ' ' << Value << '\n';
-    if (!DeprecationMsg.isTriviallyEmpty())
-      Out << "#pragma clang deprecated(" << Name << ", \"" << DeprecationMsg
-          << "\")\n";
   }
 
   /// Append a \#undef line for Name.  Name should be of the form XXX
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b4bc4b7..90b1ec2 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1962,7 +1962,7 @@ let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in {
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Optional
 
-let SVETargetGuard = "sve2,sve-aes", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "sve2-aes", SMETargetGuard = InvalidMode in {
 def SVAESD   : SInst<"svaesd[_{d}]",   "ddd", "Uc", MergeNone, "aarch64_sve_aesd", [IsOverloadNone]>;
 def SVAESIMC : SInst<"svaesimc[_{d}]", "dd",  "Uc", MergeNone, "aarch64_sve_aesimc", [IsOverloadNone]>;
 def SVAESE   : SInst<"svaese[_{d}]",   "ddd", "Uc", MergeNone, "aarch64_sve_aese", [IsOverloadNone]>;
diff --git a/clang/include/clang/CIR/CIRGenerator.h b/clang/include/clang/CIR/CIRGenerator.h
index aa1a7e6..c8ca7e4 100644
--- a/clang/include/clang/CIR/CIRGenerator.h
+++ b/clang/include/clang/CIR/CIRGenerator.h
@@ -25,14 +25,15 @@
 namespace clang {
 class DeclGroupRef;
 class DiagnosticsEngine;
+namespace CIRGen {
+class CIRGenModule;
+} // namespace CIRGen
 } // namespace clang
 
 namespace mlir {
 class MLIRContext;
 } // namespace mlir
 namespace cir {
-class CIRGenModule;
-
 class CIRGenerator : public clang::ASTConsumer {
   virtual void anchor();
   clang::DiagnosticsEngine &diags;
@@ -44,7 +45,7 @@ class CIRGenerator : public clang::ASTConsumer {
 
 protected:
   std::unique_ptr<mlir::MLIRContext> mlirCtx;
-  std::unique_ptr<CIRGenModule> cgm;
+  std::unique_ptr<clang::CIRGen::CIRGenModule> cgm;
 
 public:
   CIRGenerator(clang::DiagnosticsEngine &diags,
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index 69d6e97..305a064 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -22,7 +22,7 @@ def CIR_Dialect : Dialect {
   let summary = "A high-level dialect for analyzing and optimizing Clang "
                 "supported languages";
 
-  let cppNamespace = "::mlir::cir";
+  let cppNamespace = "::cir";
 
   let useDefaultAttributePrinterParser = 0;
   let useDefaultTypePrinterParser = 0;
@@ -31,13 +31,15 @@ def CIR_Dialect : Dialect {
     void registerAttributes();
     void registerTypes();
 
-    Type parseType(DialectAsmParser &parser) const override;
-    void printType(Type type, DialectAsmPrinter &printer) const override;
+    mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
+    void printType(mlir::Type type,
+                   mlir::DialectAsmPrinter &printer) const override;
 
-    Attribute parseAttribute(DialectAsmParser &parser,
-                             Type type) const override;
+    mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser,
+                                   mlir::Type type) const override;
 
-    void printAttribute(Attribute attr, DialectAsmPrinter &os) const override;
+    void printAttribute(mlir::Attribute attr,
+                        mlir::DialectAsmPrinter &os) const override;
   }];
 }
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index c0440fa..4462eb6 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -51,12 +51,12 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 // following:
 //
 //   class CIRFooOpLowering
-//       : public mlir::OpConversionPattern<mlir::cir::FooOp> {
+//       : public mlir::OpConversionPattern<cir::FooOp> {
 //   public:
-//     using OpConversionPattern<mlir::cir::FooOp>::OpConversionPattern;
+//     using OpConversionPattern<cir::FooOp>::OpConversionPattern;
 //
 //     mlir::LogicalResult matchAndRewrite(
-//         mlir::cir::FooOp op,
+//         cir::FooOp op,
 //         OpAdaptor adaptor,
 //         mlir::ConversionPatternRewriter &rewriter) const override {
 //       rewriter.replaceOpWithNewOp<mlir::LLVM::BarOp>(
@@ -92,7 +92,7 @@ def FuncOp : CIR_Op<"func"> {
 
   let skipDefaultBuilders = 1;
 
-  let builders = [OpBuilder<(ins "StringRef":$name)>];
+  let builders = [OpBuilder<(ins "llvm::StringRef":$name)>];
 
   let hasCustomAssemblyFormat = 1;
   let hasVerifier = 1;
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index 53a4a69..1c6ac4b 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -34,11 +34,6 @@ class OpenACCClause;
 
 class SemaOpenACC : public SemaBase {
 private:
-  /// A collection of loop constructs in the compute construct scope that
-  /// haven't had their 'parent' compute construct set yet. Entires will only be
-  /// made to this list in the case where we know the loop isn't an orphan.
-  llvm::SmallVector<OpenACCLoopConstruct *> ParentlessLoopConstructs;
-
   struct ComputeConstructInfo {
     /// Which type of compute construct we are inside of, which we can use to
     /// determine whether we should add loops to the above collection.  We can
@@ -768,7 +763,6 @@ public:
     SourceLocation OldLoopWorkerClauseLoc;
     SourceLocation OldLoopVectorClauseLoc;
     SourceLocation OldLoopWithoutSeqLoc;
-    llvm::SmallVector<OpenACCLoopConstruct *> ParentlessLoopConstructs;
     llvm::SmallVector<OpenACCReductionClause *> ActiveReductionClauses;
     LoopInConstructRAII LoopRAII;
 
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 047f354..f33d2fb 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -1738,13 +1738,12 @@ void NamedDecl::printNestedNameSpecifier(raw_ostream &OS,
 
     // Suppress inline namespace if it doesn't make the result ambiguous.
     if (Ctx->isInlineNamespace() && NameInScope) {
-      bool isRedundant =
-          cast<NamespaceDecl>(Ctx)->isRedundantInlineQualifierFor(NameInScope);
       if (P.SuppressInlineNamespace ==
               PrintingPolicy::SuppressInlineNamespaceMode::All ||
           (P.SuppressInlineNamespace ==
                PrintingPolicy::SuppressInlineNamespaceMode::Redundant &&
-           isRedundant)) {
+           cast<NamespaceDecl>(Ctx)->isRedundantInlineQualifierFor(
+               NameInScope))) {
         continue;
       }
     }
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index 2d864a2..8212553 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -28,44 +28,15 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) {
 OpenACCComputeConstruct *OpenACCComputeConstruct::Create(
     const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
     SourceLocation DirLoc, SourceLocation EndLoc,
-    ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock,
-    ArrayRef<OpenACCLoopConstruct *> AssociatedLoopConstructs) {
+    ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock) {
   void *Mem = C.Allocate(
       OpenACCComputeConstruct::totalSizeToAlloc<const OpenACCClause *>(
           Clauses.size()));
   auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc,
                                                  Clauses, StructuredBlock);
-
-  llvm::for_each(AssociatedLoopConstructs, [&](OpenACCLoopConstruct *C) {
-    C->setParentComputeConstruct(Inst);
-  });
-
   return Inst;
 }
 
-void OpenACCComputeConstruct::findAndSetChildLoops() {
-  struct LoopConstructFinder : RecursiveASTVisitor<LoopConstructFinder> {
-    OpenACCComputeConstruct *Construct = nullptr;
-
-    LoopConstructFinder(OpenACCComputeConstruct *Construct)
-        : Construct(Construct) {}
-
-    bool TraverseOpenACCComputeConstruct(OpenACCComputeConstruct *C) {
-      // Stop searching if we find a compute construct.
-      return true;
-    }
-    bool TraverseOpenACCLoopConstruct(OpenACCLoopConstruct *C) {
-      // Stop searching if we find a loop construct, after taking ownership of
-      // it.
-      C->setParentComputeConstruct(Construct);
-      return true;
-    }
-  };
-
-  LoopConstructFinder f(this);
-  f.TraverseStmt(getAssociatedStmt());
-}
-
 OpenACCLoopConstruct::OpenACCLoopConstruct(unsigned NumClauses)
     : OpenACCAssociatedStmtConstruct(
           OpenACCLoopConstructClass, OpenACCDirectiveKind::Loop,
@@ -79,11 +50,13 @@ OpenACCLoopConstruct::OpenACCLoopConstruct(unsigned NumClauses)
 }
 
 OpenACCLoopConstruct::OpenACCLoopConstruct(
-    SourceLocation Start, SourceLocation DirLoc, SourceLocation End,
+    OpenACCDirectiveKind ParentKind, SourceLocation Start,
+    SourceLocation DirLoc, SourceLocation End,
     ArrayRef<const OpenACCClause *> Clauses, Stmt *Loop)
     : OpenACCAssociatedStmtConstruct(OpenACCLoopConstructClass,
                                      OpenACCDirectiveKind::Loop, Start, DirLoc,
-                                     End, Loop) {
+                                     End, Loop),
+      ParentComputeConstructKind(ParentKind) {
   // accept 'nullptr' for the loop. This is diagnosed somewhere, but this gives
   // us some level of AST fidelity in the error case.
   assert((Loop == nullptr || isa<ForStmt, CXXForRangeStmt>(Loop)) &&
@@ -96,12 +69,6 @@ OpenACCLoopConstruct::OpenACCLoopConstruct(
                                 Clauses.size()));
 }
 
-void OpenACCLoopConstruct::setLoop(Stmt *Loop) {
-  assert((isa<ForStmt, CXXForRangeStmt>(Loop)) &&
-         "Associated Loop not a for loop?");
-  setAssociatedStmt(Loop);
-}
-
 OpenACCLoopConstruct *OpenACCLoopConstruct::CreateEmpty(const ASTContext &C,
                                                         unsigned NumClauses) {
   void *Mem =
@@ -111,15 +78,14 @@ OpenACCLoopConstruct *OpenACCLoopConstruct::CreateEmpty(const ASTContext &C,
   return Inst;
 }
 
-OpenACCLoopConstruct *
-OpenACCLoopConstruct::Create(const ASTContext &C, SourceLocation BeginLoc,
-                             SourceLocation DirLoc, SourceLocation EndLoc,
-                             ArrayRef<const OpenACCClause *> Clauses,
-                             Stmt *Loop) {
+OpenACCLoopConstruct *OpenACCLoopConstruct::Create(
+    const ASTContext &C, OpenACCDirectiveKind ParentKind,
+    SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation EndLoc,
+    ArrayRef<const OpenACCClause *> Clauses, Stmt *Loop) {
   void *Mem =
       C.Allocate(OpenACCLoopConstruct::totalSizeToAlloc<const OpenACCClause *>(
           Clauses.size()));
-  auto *Inst =
-      new (Mem) OpenACCLoopConstruct(BeginLoc, DirLoc, EndLoc, Clauses, Loop);
+  auto *Inst = new (Mem)
+      OpenACCLoopConstruct(ParentKind, BeginLoc, DirLoc, EndLoc, Clauses, Loop);
   return Inst;
 }
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 01cfb1f..b54c166 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2928,7 +2928,7 @@ void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) {
   if (S->isOrphanedLoopConstruct())
     OS << " <orphan>";
   else
-    OS << " parent: " << S->getParentComputeConstruct();
+    OS << " parent: " << S->getParentComputeConstructKind();
 }
 
 void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) {
diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp
index 2d18fb3..6904bce 100644
--- a/clang/lib/Basic/Attributes.cpp
+++ b/clang/lib/Basic/Attributes.cpp
@@ -18,6 +18,7 @@
 #include "clang/Basic/TargetInfo.h"
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSwitch.h"
 
 using namespace clang;
 
@@ -155,26 +156,17 @@ std::string AttributeCommonInfo::getNormalizedFullName() const {
       normalizeName(getAttrName(), getScopeName(), getSyntax()));
 }
 
-// Sorted list of attribute scope names
-static constexpr std::pair<StringRef, AttributeCommonInfo::Scope> ScopeList[] =
-    {{"", AttributeCommonInfo::Scope::NONE},
-     {"clang", AttributeCommonInfo::Scope::CLANG},
-     {"gnu", AttributeCommonInfo::Scope::GNU},
-     {"gsl", AttributeCommonInfo::Scope::GSL},
-     {"hlsl", AttributeCommonInfo::Scope::HLSL},
-     {"msvc", AttributeCommonInfo::Scope::MSVC},
-     {"omp", AttributeCommonInfo::Scope::OMP},
-     {"riscv", AttributeCommonInfo::Scope::RISCV}};
-
 AttributeCommonInfo::Scope
 getScopeFromNormalizedScopeName(StringRef ScopeName) {
-  auto It = std::lower_bound(
-      std::begin(ScopeList), std::end(ScopeList), ScopeName,
-      [](const std::pair<StringRef, AttributeCommonInfo::Scope> &Element,
-         StringRef Value) { return Element.first < Value; });
-  assert(It != std::end(ScopeList) && It->first == ScopeName);
-
-  return It->second;
+  return llvm::StringSwitch<AttributeCommonInfo::Scope>(ScopeName)
+      .Case("", AttributeCommonInfo::Scope::NONE)
+      .Case("clang", AttributeCommonInfo::Scope::CLANG)
+      .Case("gnu", AttributeCommonInfo::Scope::GNU)
+      .Case("gsl", AttributeCommonInfo::Scope::GSL)
+      .Case("hlsl", AttributeCommonInfo::Scope::HLSL)
+      .Case("msvc", AttributeCommonInfo::Scope::MSVC)
+      .Case("omp", AttributeCommonInfo::Scope::OMP)
+      .Case("riscv", AttributeCommonInfo::Scope::RISCV);
 }
 
 unsigned AttributeCommonInfo::calculateAttributeSpellingListIndex() const {
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 3fd4337..e35ee2b 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -473,7 +473,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSVE2p1)
     Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
 
-  if (HasSVE2 && HasSVEAES)
+  if (HasSVE2 && HasSVE2AES)
     Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
 
   if (HasSVE2 && HasSVE2BitPerm)
@@ -769,7 +769,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("f32mm", FPU & SveMode && HasMatmulFP32)
       .Case("f64mm", FPU & SveMode && HasMatmulFP64)
       .Case("sve2", FPU & SveMode && HasSVE2)
-      .Case("sve2-pmull128", FPU & SveMode && HasSVEAES && HasSVE2)
+      .Case("sve2-pmull128", FPU & SveMode && HasSVE2AES)
       .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm)
       .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
       .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
@@ -861,10 +861,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasSVE2 = true;
       HasSVE2p1 = true;
     }
-    if (Feature == "+sve-aes") {
+    if (Feature == "+sve2-aes") {
       FPU |= NeonMode;
-      HasAES = true;
-      HasSVEAES = true;
+      FPU |= SveMode;
+      HasFullFP16 = true;
+      HasSVE2 = true;
+      HasSVE2AES = true;
     }
     if (Feature == "+sve2-sha3") {
       FPU |= NeonMode;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 4c25bdb..ea3e401 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -78,7 +78,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasBFloat16 = false;
   bool HasSVE2 = false;
   bool HasSVE2p1 = false;
-  bool HasSVEAES = false;
+  bool HasSVE2AES = false;
   bool HasSVE2SHA3 = false;
   bool HasSVE2SM4 = false;
   bool HasSVEB16B16 = false;
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 99f8f29..0788191 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -337,12 +337,9 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (hasFastFMA())
     Builder.defineMacro("FP_FAST_FMA");
 
-  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize),
-                      "compile-time-constant access to the wavefront size will "
-                      "be removed in a future release");
-  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize),
-                      "compile-time-constant access to the wavefront size will "
-                      "be removed in a future release");
+  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize));
+  // ToDo: deprecate this macro for naming consistency.
+  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize));
   Builder.defineMacro("__AMDGCN_CUMODE__", Twine(CUMode));
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index c1adc7e..4e8a8cc 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -22,7 +22,9 @@
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
 
-using namespace cir;
+using namespace clang;
+using namespace clang::CIRGen;
+
 CIRGenModule::CIRGenModule(mlir::MLIRContext &context,
                            clang::ASTContext &astctx,
                            const clang::CodeGenOptions &cgo,
@@ -75,7 +77,7 @@ void CIRGenModule::buildGlobal(clang::GlobalDecl gd) {
 void CIRGenModule::buildGlobalFunctionDefinition(clang::GlobalDecl gd,
                                                  mlir::Operation *op) {
   auto const *funcDecl = cast<FunctionDecl>(gd.getDecl());
-  auto funcOp = builder.create<mlir::cir::FuncOp>(
+  auto funcOp = builder.create<cir::FuncOp>(
       getLoc(funcDecl->getSourceRange()), funcDecl->getIdentifier()->getName());
   theModule.push_back(funcOp);
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 2bf6a5d..9e5950f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -31,10 +31,8 @@ class LangOptions;
 class SourceLocation;
 class SourceRange;
 class TargetInfo;
-} // namespace clang
 
-using namespace clang;
-namespace cir {
+namespace CIRGen {
 
 /// This class organizes the cross-function state that is used while generating
 /// CIR code.
@@ -91,6 +89,8 @@ public:
   DiagnosticBuilder errorNYI(SourceRange, llvm::StringRef);
   DiagnosticBuilder errorNYI(SourceRange, llvm::StringRef, llvm::StringRef);
 };
-} // namespace cir
+} // namespace CIRGen
+
+} // namespace clang
 
 #endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index 6478e0a..fde9a35 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -13,7 +13,7 @@
 #ifndef LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H
 #define LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H
 
-namespace cir {
+namespace clang::CIRGen {
 
 /// This structure provides a set of types that are commonly used
 /// during IR emission. It's initialized once in CodeGenModule's
@@ -22,6 +22,6 @@ struct CIRGenTypeCache {
   CIRGenTypeCache() = default;
 };
 
-} // namespace cir
+} // namespace clang::CIRGen
 
 #endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENTYPECACHE_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp
index 152124a..85367a9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenerator.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp
@@ -35,9 +35,9 @@ void CIRGenerator::Initialize(ASTContext &astCtx) {
   this->astCtx = &astCtx;
 
   mlirCtx = std::make_unique<mlir::MLIRContext>();
-  mlirCtx->loadDialect<mlir::cir::CIRDialect>();
-  cgm = std::make_unique<CIRGenModule>(*mlirCtx.get(), astCtx, codeGenOpts,
-                                       diags);
+  mlirCtx->loadDialect<cir::CIRDialect>();
+  cgm = std::make_unique<clang::CIRGen::CIRGenModule>(*mlirCtx.get(), astCtx,
+                                                      codeGenOpts, diags);
 }
 
 mlir::ModuleOp CIRGenerator::getModule() const { return cgm->getModule(); }
diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
index 6d74d72..7d42da1 100644
--- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
@@ -13,7 +13,7 @@
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
 using namespace mlir;
-using namespace mlir::cir;
+using namespace cir;
 
 //===----------------------------------------------------------------------===//
 // General CIR parsing / printing
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index e0b38a2..f666e5a 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -17,13 +17,13 @@
 #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc"
 
 using namespace mlir;
-using namespace mlir::cir;
+using namespace cir;
 
 //===----------------------------------------------------------------------===//
 // CIR Dialect
 //===----------------------------------------------------------------------===//
 
-void mlir::cir::CIRDialect::initialize() {
+void cir::CIRDialect::initialize() {
   registerTypes();
   registerAttributes();
   addOperations<
@@ -36,8 +36,8 @@ void mlir::cir::CIRDialect::initialize() {
 // FuncOp
 //===----------------------------------------------------------------------===//
 
-void mlir::cir::FuncOp::build(OpBuilder &builder, OperationState &result,
-                              StringRef name) {
+void cir::FuncOp::build(OpBuilder &builder, OperationState &result,
+                        StringRef name) {
   result.addAttribute(SymbolTable::getSymbolAttrName(),
                       builder.getStringAttr(name));
 }
@@ -56,7 +56,7 @@ void cir::FuncOp::print(OpAsmPrinter &p) {
   p.printSymbolName(getSymName());
 }
 
-mlir::LogicalResult mlir::cir::FuncOp::verify() { return success(); }
+mlir::LogicalResult cir::FuncOp::verify() { return success(); }
 
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index 167c237..4eeb70f 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -13,7 +13,7 @@
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
 using namespace mlir;
-using namespace mlir::cir;
+using namespace cir;
 
 //===----------------------------------------------------------------------===//
 // General CIR parsing / printing
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 60fde03..3f95a1e 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -67,7 +67,7 @@ CreateFrontendBaseAction(CompilerInstance &CI) {
   case EmitBC:                 return std::make_unique<EmitBCAction>();
   case EmitCIR:
 #if CLANG_ENABLE_CIR
-    return std::make_unique<::cir::EmitCIRAction>();
+    return std::make_unique<cir::EmitCIRAction>();
 #else
     llvm_unreachable("CIR suppport not built into clang");
 #endif
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index de453cc..78e8a42 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4626,7 +4626,8 @@ _mm_movepi64_pi64(__m128i __a) {
 ///    A 64-bit value.
 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
 ///    the operand. The upper 64 bits are assigned zeros.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movpi64_epi64(__m64 __a) {
   return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
 }
 
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 0347c5c..dc0fa5c 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -49,6 +49,12 @@ typedef char __v16qi __attribute__((__vector_size__(16)));
                  __min_vector_width__(128)))
 #endif
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
+#else
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
+#endif
+
 #define __trunc64(x)                                                           \
   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
 #define __anyext128(x)                                                         \
@@ -1332,10 +1338,9 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setzero_si64(void)
-{
-    return __extension__ (__m64){ 0LL };
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_setzero_si64(void) {
+  return __extension__(__m64){0LL};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1353,10 +1358,9 @@ _mm_setzero_si64(void)
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set_pi32(int __i1, int __i0)
-{
-    return __extension__ (__m64)(__v2si){__i0, __i1};
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_set_pi32(int __i1, int __i0) {
+  return __extension__(__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1376,10 +1380,9 @@ _mm_set_pi32(int __i1, int __i0)
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
-{
-    return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
+  return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1407,12 +1410,11 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
-            char __b1, char __b0)
-{
-    return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
-                                         __b4, __b5, __b6, __b7};
+            char __b1, char __b0) {
+  return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
+                                      __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1428,10 +1430,9 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi32(int __i)
-{
-    return _mm_set_pi32(__i, __i);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_set1_pi32(int __i) {
+  return _mm_set_pi32(__i, __i);
 }
 
 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
@@ -1447,10 +1448,9 @@ _mm_set1_pi32(int __i)
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi16(short __w)
-{
-    return _mm_set_pi16(__w, __w, __w, __w);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_set1_pi16(short __w) {
+  return _mm_set_pi16(__w, __w, __w, __w);
 }
 
 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
@@ -1465,10 +1465,9 @@ _mm_set1_pi16(short __w)
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi8(char __b)
-{
-    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_set1_pi8(char __b) {
+  return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1486,10 +1485,9 @@ _mm_set1_pi8(char __b)
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setr_pi32(int __i0, int __i1)
-{
-    return _mm_set_pi32(__i1, __i0);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_setr_pi32(int __i0, int __i1) {
+  return _mm_set_pi32(__i1, __i0);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1509,10 +1507,9 @@ _mm_setr_pi32(int __i0, int __i1)
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
-{
-    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1540,11 +1537,10 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
-             char __b6, char __b7)
-{
-    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+             char __b6, char __b7) {
+  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
 #undef __anyext128
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6b0b484..61c29e3 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -19365,11 +19365,12 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
       }
 
       // Structs without named members are extension in C (C99 6.7.2.1p7),
-      // but are accepted by GCC.
-      if (NonBitFields == 0 && !getLangOpts().CPlusPlus) {
-        Diag(RecLoc, IsEmpty ? diag::ext_empty_struct_union :
-                               diag::ext_no_named_members_in_struct_union)
-          << Record->isUnion();
+      // but are accepted by GCC. In C2y, this became implementation-defined
+      // (C2y 6.7.3.2p10).
+      if (NonBitFields == 0 && !getLangOpts().CPlusPlus && !getLangOpts().C2y) {
+        Diag(RecLoc, IsEmpty ? diag::ext_empty_struct_union
+                             : diag::ext_no_named_members_in_struct_union)
+            << Record->isUnion();
       }
     }
   } else {
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index b970640..25427bf 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -1538,7 +1538,6 @@ SemaOpenACC::AssociatedStmtRAII::AssociatedStmtRAII(
     CollectActiveReductionClauses(S.ActiveReductionClauses, Clauses);
     SemaRef.ActiveComputeConstructInfo.Kind = DirKind;
     SemaRef.ActiveComputeConstructInfo.Clauses = Clauses;
-    SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs);
 
     // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels
     // construct, the gang clause behaves as follows. ... The region of a loop
@@ -1668,9 +1667,8 @@ SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() {
   if (DirKind == OpenACCDirectiveKind::Parallel ||
       DirKind == OpenACCDirectiveKind::Serial ||
       DirKind == OpenACCDirectiveKind::Kernels) {
-    assert(SemaRef.ParentlessLoopConstructs.empty() &&
-           "Didn't consume loop construct list?");
-    SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs);
+    // Nothing really to do here, the restorations above should be enough for
+    // now.
   } else if (DirKind == OpenACCDirectiveKind::Loop) {
     // Nothing really to do here, the LoopInConstruct should handle restorations
     // correctly.
@@ -3171,27 +3169,14 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
   case OpenACCDirectiveKind::Parallel:
   case OpenACCDirectiveKind::Serial:
   case OpenACCDirectiveKind::Kernels: {
-    auto *ComputeConstruct = OpenACCComputeConstruct::Create(
+    return OpenACCComputeConstruct::Create(
         getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses,
-        AssocStmt.isUsable() ? AssocStmt.get() : nullptr,
-        ParentlessLoopConstructs);
-
-    ParentlessLoopConstructs.clear();
-
-    return ComputeConstruct;
+        AssocStmt.isUsable() ? AssocStmt.get() : nullptr);
   }
   case OpenACCDirectiveKind::Loop: {
-    auto *LoopConstruct = OpenACCLoopConstruct::Create(
-        getASTContext(), StartLoc, DirLoc, EndLoc, Clauses,
-        AssocStmt.isUsable() ? AssocStmt.get() : nullptr);
-
-    // If we are in the scope of a compute construct, add this to the list of
-    // loop constructs that need assigning to the next closing compute
-    // construct.
-    if (isInComputeConstruct())
-      ParentlessLoopConstructs.push_back(LoopConstruct);
-
-    return LoopConstruct;
+    return OpenACCLoopConstruct::Create(
+        getASTContext(), ActiveComputeConstructInfo.Kind, StartLoc, DirLoc,
+        EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr);
   }
   }
   llvm_unreachable("Unhandled case in directive handling?");
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index e526a11..515b9f6 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1500,16 +1500,19 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
     // C99 6.7.3p8:
     //   If the specification of a function type includes any type qualifiers,
     //   the behavior is undefined.
+    // C2y changed this behavior to be implementation-defined. Clang defines
+    // the behavior in all cases to ignore the qualifier, as in C++.
     // C++11 [dcl.fct]p7:
     //   The effect of a cv-qualifier-seq in a function declarator is not the
     //   same as adding cv-qualification on top of the function type. In the
     //   latter case, the cv-qualifiers are ignored.
     if (Result->isFunctionType()) {
+      unsigned DiagId = diag::warn_typecheck_function_qualifiers_ignored;
+      if (!S.getLangOpts().CPlusPlus && !S.getLangOpts().C2y)
+        DiagId = diag::ext_typecheck_function_qualifiers_unspecified;
       diagnoseAndRemoveTypeQualifiers(
           S, DS, TypeQuals, Result, DeclSpec::TQ_const | DeclSpec::TQ_volatile,
-          S.getLangOpts().CPlusPlus
-              ? diag::warn_typecheck_function_qualifiers_ignored
-              : diag::warn_typecheck_function_qualifiers_unspecified);
+          DiagId);
       // No diagnostic for 'restrict' or '_Atomic' applied to a
       // function type; we'll diagnose those later, in BuildQualifiedType.
     }
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 6aaafb2..df54cb0 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2836,12 +2836,12 @@ void ASTStmtReader::VisitOpenACCAssociatedStmtConstruct(
 void ASTStmtReader::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
-  S->findAndSetChildLoops();
 }
 
 void ASTStmtReader::VisitOpenACCLoopConstruct(OpenACCLoopConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
+  S->ParentComputeConstructKind = Record.readEnum<OpenACCDirectiveKind>();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 7f700c2..326c2ef 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2915,6 +2915,7 @@ void ASTStmtWriter::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
 void ASTStmtWriter::VisitOpenACCLoopConstruct(OpenACCLoopConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
+  Record.writeEnum(S->getParentComputeConstructKind());
   Code = serialization::STMT_OPENACC_LOOP_CONSTRUCT;
 }
 
diff --git a/clang/test/C/C2y/n3341.c b/clang/test/C/C2y/n3341.c
new file mode 100644
index 0000000..4cff7f0
--- /dev/null
+++ b/clang/test/C/C2y/n3341.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
+// RUN: %clang_cc1 -verify=gnu -Wall -pedantic %s
+
+/* WG14 N3341: Yes
+ * Slay Some Earthly Demons III
+ *
+ * Empty structure and union objects are now implementation-defined.
+ */
+
+// expected-no-diagnostics
+
+struct R {};               // gnu-warning {{empty struct is a GNU extension}}
+#if __STDC_VERSION__ >= 201112L
+struct S { struct { }; };  // gnu-warning {{empty struct is a GNU extension}}
+#endif
+struct T { int : 0; };     // gnu-warning {{struct without named members is a GNU extension}}
+union U {};                // gnu-warning {{empty union is a GNU extension}}
+
diff --git a/clang/test/C/C2y/n3342.c b/clang/test/C/C2y/n3342.c
new file mode 100644
index 0000000..9df939c60
--- /dev/null
+++ b/clang/test/C/C2y/n3342.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -verify=expected,both -std=c2y -Wall -pedantic %s
+// RUN: %clang_cc1 -verify=clang,both -Wall -pedantic %s
+
+/* WG14 N3342: Yes
+ * Slay Some Earthly Demons IV
+ *
+ * Qualified function types are now implementation-defined instead of
+ * undefined. Clang strips the qualifiers.
+ */
+
+typedef int f(void);
+
+const f one;      /* expected-warning {{'const' qualifier on function type 'f' (aka 'int (void)') has no effect}}
+                     clang-warning {{'const' qualifier on function type 'f' (aka 'int (void)') has no effect and is a Clang extension}}
+                   */
+volatile f two;   /* expected-warning {{'volatile' qualifier on function type 'f' (aka 'int (void)') has no effect}}
+                     clang-warning {{'volatile' qualifier on function type 'f' (aka 'int (void)') has no effect and is a Clang extension}}
+                   */
+
+const volatile f three; /* expected-warning {{'const' qualifier on function type 'f' (aka 'int (void)') has no effect}}
+                           clang-warning {{'const' qualifier on function type 'f' (aka 'int (void)') has no effect and is a Clang extension}}
+                           expected-warning {{'volatile' qualifier on function type 'f' (aka 'int (void)') has no effect}}
+                           clang-warning {{'volatile' qualifier on function type 'f' (aka 'int (void)') has no effect and is a Clang extension}}
+                         */
+
+#if __STDC_VERSION__ >= 201112L
+// Atomic types have an explicit constraint making it ill-formed.
+_Atomic f four;   // both-error {{_Atomic cannot be applied to function type 'f' (aka 'int (void)')}}
+#endif
+
+// There's no point to testing 'restrict' because that requires a pointer type.
diff --git a/clang/test/C/C2y/n3346.c b/clang/test/C/C2y/n3346.c
new file mode 100644
index 0000000..a6fc332
--- /dev/null
+++ b/clang/test/C/C2y/n3346.c
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic -ffreestanding %s
+// RUN: %clang_cc1 -std=c99 -verify=expected,ped -Wall -pedantic -ffreestanding %s
+
+/* WG14 N3346: Yes
+ * Slay Some Earthly Demons VIII
+ *
+ * Updates some undefined behavior during initialization to instead be a
+ * constraint violation.
+ */
+
+// The initializer for a scalar shall be a single expression, optionally
+// enclosed in braces, or it shall be an empty initializer.
+int i = 12, j = {12}, k = {}; // ped-warning {{use of an empty initializer is a C23 extension}}
+
+struct S {
+  int i;
+  float f;
+  int : 0;
+  char c;
+};
+
+void test1(void) {
+  // The initializer for an object that has structure or union type shall be
+  // either a single expression that has compatible type or a brace-enclosed
+  // list of initializers for the elements or named members.
+  struct S s1 = { 1, 1.2f, 'a' };
+  struct S s2 = s1;
+
+  // Despite being structurally identical to S, T is not compatible with S.
+  struct T { int i; float f; int : 0; char c; } t;
+  struct S s3 = t; // expected-error {{initializing 'struct S' with an expression of incompatible type 'struct T'}}
+}
+
+void test2(void) {
+  typedef __WCHAR_TYPE__ wchar_t;
+
+  // The initializer for an array shall be either a string literal, optionally
+  // enclosed in braces, or a brace-enclosed list of initializers for the
+  // elements. An array initialized by character string literal or UTF-8 string
+  // literal shall have a character type as element type. An array initialized
+  // with a wide string literal shall have element type compatible with a
+  // qualified or unqualified wchar_t, char16_t, or char32_t, and the string
+  // literal shall have the corresponding encoding prefix (L, u, or U,
+  // respectively).
+  char str1[] = "string literal";
+  char str2[] = { "string literal" };
+
+  float str5[] = "this doesn't work";          // expected-error {{array initializer must be an initializer list}}
+  float str6[] = { "this also doesn't work" }; // expected-error {{initializing 'float' with an expression of incompatible type 'char[23]'}}
+
+  wchar_t str7[] = L"string literal";
+  wchar_t str8[] = { L"string literal" };
+
+#if __STDC_VERSION__ >= 201112L
+  typedef __CHAR16_TYPE__ char16_t;
+  typedef __CHAR32_TYPE__ char32_t;
+
+  char str3[] = u8"string literal";
+  char str4[] = { u8"string literal" };
+
+  char16_t str9[] = u"string literal";
+  char16_t str10[] = { u"string literal" };
+  char32_t str11[] = U"string literal";
+  char32_t str12[] = { U"string literal" };
+
+  char16_t str15[] = "nope";     // expected-error {{initializing wide char array with non-wide string literal}}
+  char16_t str16[] = { "nope" }; // expected-error-re {{incompatible pointer to integer conversion initializing 'char16_t' (aka '{{.*}}') with an expression of type 'char[5]'}}
+  char32_t str17[] = "nope";     // expected-error {{initializing wide char array with non-wide string literal}}
+  char32_t str18[] = { "nope" }; // expected-error-re {{incompatible pointer to integer conversion initializing 'char32_t' (aka '{{.*}}') with an expression of type 'char[5]'}}
+#endif
+
+  wchar_t str13[] = "nope";      // expected-error {{initializing wide char array with non-wide string literal}}
+  wchar_t str14[] = { "nope" };  // expected-error-re {{incompatible pointer to integer conversion initializing 'wchar_t' (aka '{{.*}}') with an expression of type 'char[5]'}}
+}
diff --git a/clang/test/CodeGen/X86/builtin_test_helpers.h b/clang/test/CodeGen/X86/builtin_test_helpers.h
index 0edec14..c3307f7 100644
--- a/clang/test/CodeGen/X86/builtin_test_helpers.h
+++ b/clang/test/CodeGen/X86/builtin_test_helpers.h
@@ -9,6 +9,25 @@ constexpr bool match_m64(__m64 _v, unsigned long long a) {
   return v[0] == a;
 }
 
+constexpr bool match_v1di(__m64 v, long long a) {
+  return v[0] == a;
+}
+
+constexpr bool match_v2si(__m64 _v, int a, int b) {
+  __v2si v = (__v2si)_v;
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v4hi(__m64 _v, short a, short b, short c, short d) {
+  __v4hi v = (__v4hi)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v8qi(__m64 _v, char a, char b, char c, char d, char e, char f, char g, char h) {
+  __v8qi v = (__v8qi)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
+}
+
 constexpr bool match_m128(__m128 v, float a, float b, float c, float d) {
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
 }
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 50f47e1..52cbe45 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -9,6 +9,7 @@
 
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m64 test_mm_abs_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi8
@@ -409,6 +410,7 @@ __m64 test_mm_set_pi8(char a, char b, char c, char d, char e, char f, char g, ch
   // CHECK: insertelement <8 x i8>
   return _mm_set_pi8(a, b, c, d, e, f, g, h);
 }
+TEST_CONSTEXPR(match_v8qi(_mm_set_pi8(0, -1, 2, -3, 4, -5, 6, -7), -7, 6, -5, 4, -3, 2, -1, 0));
 
 __m64 test_mm_set_pi16(short a, short b, short c, short d) {
   // CHECK-LABEL: test_mm_set_pi16
@@ -418,6 +420,7 @@ __m64 test_mm_set_pi16(short a, short b, short c, short d) {
   // CHECK: insertelement <4 x i16>
   return _mm_set_pi16(a, b, c, d);
 }
+TEST_CONSTEXPR(match_v4hi(_mm_set_pi16(101, 102, -103, -104), -104, -103, 102, 101));
 
 __m64 test_mm_set_pi32(int a, int b) {
   // CHECK-LABEL: test_mm_set_pi32
@@ -425,6 +428,7 @@ __m64 test_mm_set_pi32(int a, int b) {
   // CHECK: insertelement <2 x i32>
   return _mm_set_pi32(a, b);
 }
+TEST_CONSTEXPR(match_v2si(_mm_set_pi32(5000, -1500), -1500, 5000));
 
 __m64 test_mm_setr_pi8(char a, char b, char c, char d, char e, char f, char g, char h) {
   // CHECK-LABEL: test_mm_setr_pi8
@@ -438,6 +442,7 @@ __m64 test_mm_setr_pi8(char a, char b, char c, char d, char e, char f, char g, c
   // CHECK: insertelement <8 x i8>
   return _mm_setr_pi8(a, b, c, d, e, f, g, h);
 }
+TEST_CONSTEXPR(match_v8qi(_mm_setr_pi8(0, -1, 2, -3, 4, -5, 6, -7), 0, -1, 2, -3, 4, -5, 6, -7));
 
 __m64 test_mm_setr_pi16(short a, short b, short c, short d) {
   // CHECK-LABEL: test_mm_setr_pi16
@@ -447,6 +452,7 @@ __m64 test_mm_setr_pi16(short a, short b, short c, short d) {
   // CHECK: insertelement <4 x i16>
   return _mm_setr_pi16(a, b, c, d);
 }
+TEST_CONSTEXPR(match_v4hi(_mm_setr_pi16(101, 102, -103, -104), 101, 102, -103, -104));
 
 __m64 test_mm_setr_pi32(int a, int b) {
   // CHECK-LABEL: test_mm_setr_pi32
@@ -454,6 +460,14 @@ __m64 test_mm_setr_pi32(int a, int b) {
   // CHECK: insertelement <2 x i32>
   return _mm_setr_pi32(a, b);
 }
+TEST_CONSTEXPR(match_v2si(_mm_setr_pi32(5000, -1500), 5000, -1500));
+
+__m64 test_mm_setzero_si64() {
+  // CHECK-LABEL: test_mm_setzero_si64
+  // CHECK: zeroinitializer
+  return _mm_setzero_si64();
+}
+TEST_CONSTEXPR(match_m64(_mm_setzero_si64(), 0ULL));
 
 __m64 test_mm_set1_pi8(char a) {
   // CHECK-LABEL: test_mm_set1_pi8
@@ -467,6 +481,7 @@ __m64 test_mm_set1_pi8(char a) {
   // CHECK: insertelement <8 x i8>
   return _mm_set1_pi8(a);
 }
+TEST_CONSTEXPR(match_v8qi(_mm_set1_pi8(99), 99, 99, 99, 99, 99, 99, 99, 99));
 
 __m64 test_mm_set1_pi16(short a) {
   // CHECK-LABEL: test_mm_set1_pi16
@@ -476,6 +491,7 @@ __m64 test_mm_set1_pi16(short a) {
   // CHECK: insertelement <4 x i16>
   return _mm_set1_pi16(a);
 }
+TEST_CONSTEXPR(match_v4hi(_mm_set1_pi16(-128), -128, -128, -128, -128));
 
 __m64 test_mm_set1_pi32(int a) {
   // CHECK-LABEL: test_mm_set1_pi32
@@ -483,6 +499,7 @@ __m64 test_mm_set1_pi32(int a) {
   // CHECK: insertelement <2 x i32>
   return _mm_set1_pi32(a);
 }
+TEST_CONSTEXPR(match_v2si(_mm_set1_pi32(55), 55, 55));
 
 __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_shuffle_pi8
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 6defd94..04df59e 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -867,20 +867,19 @@ __m128d test_mm_min_sd(__m128d A, __m128d B) {
   return _mm_min_sd(A, B);
 }
 
-__m64 test_mm_movepi64_pi64(__m128i A)
-{
+__m64 test_mm_movepi64_pi64(__m128i A) {
   // CHECK-LABEL: test_mm_movepi64_pi64
   // CHECK: [[EXT:%.*]] = extractelement <2 x i64> %1, i32 0
   return _mm_movepi64_pi64(A);
 }
 TEST_CONSTEXPR(match_m64(_mm_movepi64_pi64((__m128i){8, -8}), 8ULL));
 
-__m128i test_mm_movpi64_epi64(__m64 A)
-{
+__m128i test_mm_movpi64_epi64(__m64 A) {
   // CHECK-LABEL: test_mm_movpi64_epi64
   // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
   return _mm_movpi64_epi64(A);
 }
+TEST_CONSTEXPR(match_m128i(_mm_movpi64_epi64((__m64){5LL}), 5ULL, 0ULL));
 
 __m128i test_mm_move_epi64(__m128i A) {
   // CHECK-LABEL: test_mm_move_epi64
diff --git a/clang/test/CodeGen/aarch64-fmv-dependencies.c b/clang/test/CodeGen/aarch64-fmv-dependencies.c
index e1105f3..137f64d 100644
--- a/clang/test/CodeGen/aarch64-fmv-dependencies.c
+++ b/clang/test/CodeGen/aarch64-fmv-dependencies.c
@@ -189,7 +189,7 @@ int caller() {
 // CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
 // CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
 // CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
-// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+v8a"
+// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-aes,+v8a"
 // CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
 // CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sha3,+v8a"
 // CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sm4,+v8a"
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
index 0839b32..5ea27aa 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
index 08ca748..9442d14 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
index 78d3deb..23f838c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
index 48d1301..575c09c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
index 09583f98..a4935d8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
index a4ffc31..a712a4f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c
index 2bb0ee8..f04e1a7 100644
--- a/clang/test/Driver/aarch64-implied-sve-features.c
+++ b/clang/test/Driver/aarch64-implied-sve-features.c
@@ -36,7 +36,7 @@
 // SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT
-// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "-sve2-aes"
+// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3+nosve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3-REVERT
 // SVE2-SHA3-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-sha3"
@@ -47,11 +47,8 @@
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3
 // SVE2-SHA3: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sha3"
 
-// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-AES
-// SVE-AES: "-target-feature" "+aes"{{.*}} "-target-feature" "+sve-aes"
-
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES
-// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
+// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SM4
 // SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4"
@@ -69,7 +66,7 @@
 // SVE-SUBFEATURE-CONFLICT-NOT: "-target-feature" "+sve"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SUBFEATURE-CONFLICT-REV
-// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
+// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES
 // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2-bitperm"
diff --git a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip b/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip
deleted file mode 100644
index aca5915..0000000
--- a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip
+++ /dev/null
@@ -1,111 +0,0 @@
-// REQUIRES: amdgpu-registered-target
-// RUN: %clang -xhip --offload-arch=gfx1030 --offload-host-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s
-// RUN: %clang -xhip --offload-arch=gfx1030 --offload-device-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s
-
-// Test that deprecation warnings for the wavefront size macro are emitted properly.
-
-#include <type_traits>
-
-#define WRAPPED __AMDGCN_WAVEFRONT_SIZE__
-
-#define DOUBLE_WRAPPED (WRAPPED)
-
-__attribute__((host, device)) void use(int, const char*);
-
-template<int N> __attribute__((host, device)) int templatify(int x) {
-    return x + N;
-}
-
-__attribute__((device)) const int GlobalConst = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-constexpr int GlobalConstExpr = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-#if defined(__HIP_DEVICE_COMPILE__) && (__AMDGCN_WAVEFRONT_SIZE__ == 64) // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int foo(void);
-#endif
-
-__attribute__((device)) int device_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-__attribute__((device))
-void device_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(GlobalConst, "device function");
-    use(GlobalConstExpr, "device function");
-}
-
-__attribute__((global))
-void global_fun() {
-    // no warnings expected
-    use(__AMDGCN_WAVEFRONT_SIZE, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-int host_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int host_var_alt = __AMDGCN_WAVEFRONT_SIZE; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-int host_var_wrapped = WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int host_var_double_wrapped = DOUBLE_WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-__attribute__((host))
-void host_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(GlobalConst, "host function");
-    use(GlobalConstExpr, "host function");
-}
-
-__attribute((host, device))
-void host_device_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE__, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-template <unsigned int OuterWarpSize = __AMDGCN_WAVEFRONT_SIZE__> // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-class FunSelector {
-public:
-    template<unsigned int FunWarpSize = OuterWarpSize>
-    __attribute__((device))
-    auto fun(void)
-        -> typename std::enable_if<(FunWarpSize <= __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    {
-        use(1, "yay!");
-    }
-
-    template<unsigned int FunWarpSize = OuterWarpSize>
-    __attribute__((device))
-    auto fun(void)
-        -> typename std::enable_if<(FunWarpSize > __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    {
-        use(0, "nay!");
-    }
-};
-
-__attribute__((device))
-void device_fun_selector_user() {
-    FunSelector<> f;
-    f.fun<>();
-    f.fun<1>();
-    f.fun<1000>();
-
-    std::enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x = 42; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-__attribute__((device)) std::enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type DeviceFunTemplateRet(void) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    return 42;
-}
-
-__attribute__((device)) int DeviceFunTemplateArg(std::enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    return x;
-}
-
-// expected-note@* 0+ {{macro marked 'deprecated' here}}
diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c
index 0396718..03eacf9 100644
--- a/clang/test/Driver/print-supported-extensions-aarch64.c
+++ b/clang/test/Driver/print-supported-extensions-aarch64.c
@@ -77,18 +77,17 @@
 // CHECK-NEXT:     profile             FEAT_SPE                                               Enable Statistical Profiling extension
 // CHECK-NEXT:     predres2            FEAT_SPECRES2                                          Enable Speculation Restriction Instruction
 // CHECK-NEXT:     ssbs                FEAT_SSBS, FEAT_SSBS2                                  Enable Speculative Store Bypass Safe bit
-// CHECK-NEXT:     ssve-aes            FEAT_SSVE_AES                                          Enable Armv9.6-A SVE AES support in streaming SVE mode
+// CHECK-NEXT:     ssve-aes            FEAT_SSVE_AES                                          Enable Armv9.6-A SVE2 AES support in streaming SVE mode
 // CHECK-NEXT:     ssve-fp8dot2        FEAT_SSVE_FP8DOT2                                      Enable SVE2 FP8 2-way dot product instructions
 // CHECK-NEXT:     ssve-fp8dot4        FEAT_SSVE_FP8DOT4                                      Enable SVE2 FP8 4-way dot product instructions
 // CHECK-NEXT:     ssve-fp8fma         FEAT_SSVE_FP8FMA                                       Enable SVE2 FP8 multiply-add instructions
 // CHECK-NEXT:     sve                 FEAT_SVE                                               Enable Scalable Vector Extension (SVE) instructions
-// CHECK-NEXT:     sve-aes             FEAT_SVE_AES, FEAT_SVE_PMULL128                        Enable SVE AES and quadword SVE polynomial multiply instructions
-// CHECK-NEXT:     sve-aes2            FEAT_SVE_AES2                                          Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions
+// CHECK-NEXT:     sve-aes2            FEAT_SVE_AES2                                          Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions
 // CHECK-NEXT:     sve-b16b16          FEAT_SVE_B16B16                                        Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions
 // CHECK-NEXT:     sve-bfscale         FEAT_SVE_BFSCALE                                       Enable Armv9.6-A SVE BFloat16 scaling instructions
 // CHECK-NEXT:     sve-f16f32mm        FEAT_SVE_F16F32MM                                      Enable Armv9.6-A FP16 to FP32 Matrix Multiply
 // CHECK-NEXT:     sve2                FEAT_SVE2                                              Enable Scalable Vector Extension 2 (SVE2) instructions
-// CHECK-NEXT:     sve2-aes                                                                   An alias of +sve2+sve-aes
+// CHECK-NEXT:     sve2-aes            FEAT_SVE_AES, FEAT_SVE_PMULL128                        Enable AES SVE2 instructions
 // CHECK-NEXT:     sve2-bitperm        FEAT_SVE_BitPerm                                       Enable bit permutation SVE2 instructions
 // CHECK-NEXT:     sve2-sha3           FEAT_SVE_SHA3                                          Enable SHA3 SVE2 instructions
 // CHECK-NEXT:     sve2-sm4            FEAT_SVE_SM4                                           Enable SM4 SVE2 instructions
diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c
index 80c0109..1fd0244 100644
--- a/clang/test/Misc/warning-flags.c
+++ b/clang/test/Misc/warning-flags.c
@@ -18,7 +18,7 @@ This test serves two purposes:
 
 The list of warnings below should NEVER grow.  It should gradually shrink to 0.
 
-CHECK: Warnings without flags (62):
+CHECK: Warnings without flags (61):
 
 CHECK-NEXT:   ext_expected_semi_decl_list
 CHECK-NEXT:   ext_missing_whitespace_after_macro_name
@@ -76,7 +76,6 @@ CHECK-NEXT:   warn_register_objc_catch_parm
 CHECK-NEXT:   warn_related_result_type_compatibility_class
 CHECK-NEXT:   warn_related_result_type_compatibility_protocol
 CHECK-NEXT:   warn_template_export_unsupported
-CHECK-NEXT:   warn_typecheck_function_qualifiers
 CHECK-NEXT:   warn_undef_interface
 CHECK-NEXT:   warn_undef_interface_suggest
 CHECK-NEXT:   warn_undef_protocolref
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index fc786f4..418430b 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -227,20 +227,8 @@
 // CHECK-NONEON-NOT: __ARM_FEATURE_SVE 1
 // CHECK-NONEON-NOT: __ARM_NEON 1
 
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-aes -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEAES %s
-// CHECK-SVEAES: __ARM_FEATURE_AES 1
-
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-aes -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2AES %s
-// CHECK-SVE2AES: __ARM_FEATURE_AES 1
-// CHECK-SVE2AES: __ARM_FEATURE_SVE2 1
 // CHECK-SVE2AES: __ARM_FEATURE_SVE2_AES 1
-
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-aes+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEAES-SVE2 %s
-// CHECK-SVEAES-SVE2: __ARM_FEATURE_AES 1
-// CHECK-SVEAES-SVE2: __ARM_FEATURE_SVE2 1
-// CHECK-SVEAES-SVE2: __ARM_FEATURE_SVE2_AES 1
-
-
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sha3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SHA3 %s
 // CHECK-SVE2SHA3: __ARM_FEATURE_SVE2_SHA3 1
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sm4 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SM4 %s
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
index 93d4b00..795bb76 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
@@ -14,17 +14,17 @@
 
 void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64)
 {
-  // expected-error@+2 {{'svaesd_u8' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svaesd' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svaesd_u8' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svaesd' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svaesd,_u8,,)(svundef_u8(), svundef_u8());
-  // expected-error@+2 {{'svaese_u8' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svaese' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svaese_u8' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svaese' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svaese,_u8,,)(svundef_u8(), svundef_u8());
-  // expected-error@+2 {{'svaesimc_u8' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svaesimc' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svaesimc_u8' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svaesimc' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svaesimc,_u8,,)(svundef_u8());
-  // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svaesmc' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svaesmc' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svaesmc,_u8,,)(svundef_u8());
   // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2-bitperm}}
   // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}}
@@ -107,17 +107,17 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64)
   // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2-bitperm}}
   // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}}
   SVE_ACLE_FUNC(svbgrp,_n_u64,,)(svundef_u64(), u64);
-  // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svpmullb_pair,_u64,,)(svundef_u64(), svundef_u64());
-  // expected-error@+2 {{'svpmullb_pair_n_u64' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svpmullb_pair_n_u64' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svpmullb_pair,_n_u64,,)(svundef_u64(), u64);
-  // expected-error@+2 {{'svpmullt_pair_u64' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svpmullt_pair_u64' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svpmullt_pair,_u64,,)(svundef_u64(), svundef_u64());
-  // expected-error@+2 {{'svpmullt_pair_n_u64' needs target feature sve,sve2,sve-aes}}
-  // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2,sve-aes}}
+  // expected-error@+2 {{'svpmullt_pair_n_u64' needs target feature sve,sve2-aes}}
+  // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2-aes}}
   SVE_ACLE_FUNC(svpmullt_pair,_n_u64,,)(svundef_u64(), u64);
   // expected-error@+2 {{'svrax1_u64' needs target feature sve,sve2-sha3}}
   // overload-error@+1 {{'svrax1' needs target feature sve,sve2-sha3}}
diff --git a/clang/test/Sema/declspec.c b/clang/test/Sema/declspec.c
index 88ff83a..ca3216b 100644
--- a/clang/test/Sema/declspec.c
+++ b/clang/test/Sema/declspec.c
@@ -16,9 +16,9 @@ int gv2;
 static void buggy(int *x) { }
 
 // Type qualifiers.
-typedef int f(void); 
+typedef int f(void);
 typedef f* fptr;
-const f* v1;         // expected-warning {{qualifier on function type 'f' (aka 'int (void)') has unspecified behavior}}
+const f* v1;         // expected-warning {{'const' qualifier on function type 'f' (aka 'int (void)') has no effect and is a Clang extension}}
 __restrict__ f* v2;  // expected-error {{restrict requires a pointer or reference ('f' (aka 'int (void)') is invalid)}}
 __restrict__ fptr v3; // expected-error {{pointer to function type 'f' (aka 'int (void)') may not be 'restrict' qualified}}
 f *__restrict__ v4;   // expected-error {{pointer to function type 'f' (aka 'int (void)') may not be 'restrict' qualified}}
diff --git a/clang/test/SemaOpenACC/compute-construct-default-clause.c b/clang/test/SemaOpenACC/compute-construct-default-clause.c
index be6a705..a8d0c2b 100644
--- a/clang/test/SemaOpenACC/compute-construct-default-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-default-clause.c
@@ -23,18 +23,18 @@ void SingleOnly() {
   // expected-warning@+2{{OpenACC clause 'default' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'copy' not yet implemented}}
   #pragma acc parallel loop self default(present) private(i) default(none) copy(i)
-  while(0);
+  for(int i = 0; i < 5; ++i);
 
   // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
   // expected-warning@+2{{OpenACC construct 'serial loop' not yet implemented}}
   // expected-error@+1{{expected '('}}
   #pragma acc serial loop self default private(i) default(none) if(i)
-  while(0);
+  for(int i = 0; i < 5; ++i);
 
   // expected-warning@+2{{OpenACC construct 'kernels loop' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'default' not yet implemented}}
   #pragma acc kernels loop default(none)
-  while(0);
+  for(int i = 0; i < 5; ++i);
 
   // expected-warning@+2{{OpenACC construct 'data' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'default' not yet implemented}}
diff --git a/clang/test/SemaOpenACC/compute-construct-if-clause.c b/clang/test/SemaOpenACC/compute-construct-if-clause.c
index 41a929e..2fbf49f 100644
--- a/clang/test/SemaOpenACC/compute-construct-if-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-if-clause.c
@@ -50,15 +50,15 @@ void BoolExpr(int *I, float *F) {
   // expected-warning@+2{{OpenACC construct 'parallel loop' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'if' not yet implemented}}
 #pragma acc parallel loop if (*I < *F)
-  while(0);
+  for(int i = 0; i < 5; ++i);
   // expected-warning@+2{{OpenACC construct 'serial loop' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'if' not yet implemented}}
 #pragma acc serial loop if (*I < *F)
-  while(0);
+  for(int i = 0; i < 5; ++i);
   // expected-warning@+2{{OpenACC construct 'kernels loop' not yet implemented}}
   // expected-warning@+1{{OpenACC clause 'if' not yet implemented}}
 #pragma acc kernels loop if (*I < *F)
-  while(0);
+  for(int i = 0; i < 5; ++i);
 
   // expected-error@+1{{OpenACC 'if' clause is not valid on 'loop' directive}}
 #pragma acc loop if(I)
diff --git a/clang/test/SemaOpenACC/loop-ast.cpp b/clang/test/SemaOpenACC/loop-ast.cpp
index cc8bbfa..d8ecef9 100644
--- a/clang/test/SemaOpenACC/loop-ast.cpp
+++ b/clang/test/SemaOpenACC/loop-ast.cpp
@@ -42,12 +42,12 @@ void NormalFunc() {
   // CHECK-NEXT: CompoundStmt
   {
 #pragma acc parallel
-    // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR:[0-9a-fx]+]] {{.*}}parallel
+    // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel
     // CHECK-NEXT: CompoundStmt
     {
 #pragma acc loop
       for(int i = 0; i < 5;++i);
-    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR]]
+    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
     // CHECK-NEXT: ForStmt
     // CHECK-NEXT: DeclStmt
     // CHECK-NEXT: VarDecl {{.*}} used i 'int'
@@ -91,16 +91,16 @@ void TemplFunc() {
 
   }
 
-#pragma acc parallel
+#pragma acc serial
   {
-    // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel
+    // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial
     // CHECK-NEXT: CompoundStmt
 #pragma acc parallel
     {
-    // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR_UNINST:[0-9a-fx]+]] {{.*}}parallel
+    // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel
     // CHECK-NEXT: CompoundStmt
 #pragma acc loop
-    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_UNINST]]
+    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
     // CHECK-NEXT: ForStmt
     // CHECK-NEXT: DeclStmt
     // CHECK-NEXT: VarDecl {{.*}} i 'int'
@@ -116,7 +116,7 @@ void TemplFunc() {
       for(int i = 0; i < 5;++i);
 
 #pragma acc loop
-    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_UNINST]]
+    // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
     // CHECK-NEXT: ForStmt
     // CHECK-NEXT: DeclStmt
     // CHECK-NEXT: VarDecl {{.*}} i 'int'
@@ -166,13 +166,13 @@ void TemplFunc() {
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl{{.*}} I 'typename S::type':'int'
 
-  // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial
   // CHECK-NEXT: CompoundStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR_INST:[0-9a-fx]+]] {{.*}}parallel
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel
   // CHECK-NEXT: CompoundStmt
 
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_INST]]
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl {{.*}} i 'int'
@@ -186,7 +186,7 @@ void TemplFunc() {
   // CHECK-NEXT: DeclRefExpr{{.*}} 'i' 'int'
   // CHECK-NEXT: NullStmt
 
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_INST]]
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl {{.*}} i 'int'
diff --git a/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp b/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp
index 738fe7c..9ee2faa 100644
--- a/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp
+++ b/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp
@@ -53,8 +53,8 @@ void NormalUses() {
 #pragma acc loop gang(static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: gang clause num
   // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
   // CHECK-NEXT: gang clause static
@@ -76,8 +76,8 @@ void NormalUses() {
 #pragma acc loop gang(num:1) gang(static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: gang clause dim static
   // CHECK-NEXT: ConstantExpr{{.*}} 'int'
   // CHECK-NEXT: value: Int 1
@@ -100,8 +100,8 @@ void NormalUses() {
 #pragma acc loop gang(dim:1, static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause static
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}} 'Val' 'int'
@@ -121,8 +121,8 @@ void NormalUses() {
 #pragma acc loop gang(static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause static
   // CHECK-NEXT: OpenACCAsteriskSizeExpr
   // CHECK-NEXT: ForStmt
@@ -141,8 +141,8 @@ void NormalUses() {
 #pragma acc loop gang(static:*)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -224,8 +224,8 @@ void TemplateUses(T Val) {
 #pragma acc loop gang(static:*)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: gang clause dim
   // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}} 'One' 'unsigned int'
   // CHECK-NEXT: gang clause static
@@ -246,8 +246,8 @@ void TemplateUses(T Val) {
 #pragma acc loop gang(dim:One) gang(static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: gang clause dim static
   // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}} 'One' 'unsigned int'
   // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 'Val' 'T'
@@ -267,8 +267,8 @@ void TemplateUses(T Val) {
 #pragma acc loop gang(dim:One, static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause static
   // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 'Val' 'T'
   // CHECK-NEXT: ForStmt
@@ -287,8 +287,8 @@ void TemplateUses(T Val) {
 #pragma acc loop gang(static:Val)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -367,8 +367,8 @@ void TemplateUses(T Val) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: gang clause dim
   // CHECK-NEXT: ConstantExpr{{.*}} 'unsigned int'
   // CHECK-NEXT: value: Int 1
@@ -391,8 +391,8 @@ void TemplateUses(T Val) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: gang clause dim static
   // CHECK-NEXT: ConstantExpr{{.*}} 'unsigned int'
   // CHECK-NEXT: value: Int 1
@@ -414,8 +414,8 @@ void TemplateUses(T Val) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause static
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 'Val' 'int'
@@ -432,8 +432,8 @@ void TemplateUses(T Val) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: gang clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
diff --git a/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp b/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp
index cbf5ac9..b841beb 100644
--- a/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp
+++ b/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp
@@ -70,8 +70,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop vector(length:CTI)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: DeclRefExpr{{.*}}'Int' lvalue ParmVar{{.*}}'IsI' 'Int'
   // CHECK-NEXT: ForStmt
@@ -90,8 +90,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop vector(length:IsI)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -109,8 +109,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop vector
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: DeclRefExpr{{.*}}'Int' lvalue ParmVar{{.*}}'IsI' 'Int'
   // CHECK-NEXT: ForStmt
@@ -194,8 +194,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}}'IsI' 'int'
@@ -212,8 +212,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -228,8 +228,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}}'IsI' 'int'
@@ -330,8 +330,8 @@ void uses() {
 #pragma acc loop vector(length:C)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -349,8 +349,8 @@ void uses() {
 #pragma acc loop vector
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <UserDefinedConversion>
   // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int'
@@ -372,8 +372,8 @@ void uses() {
 #pragma acc loop vector(C)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var
@@ -393,8 +393,8 @@ void uses() {
 #pragma acc loop vector(length:i)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -412,8 +412,8 @@ void uses() {
 #pragma acc loop vector
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <UserDefinedConversion>
   // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int'
@@ -435,8 +435,8 @@ void uses() {
 #pragma acc loop vector(C)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var
@@ -456,8 +456,8 @@ void uses() {
 #pragma acc loop vector(length:i)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: vector clause
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
diff --git a/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp b/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp
index d5c14e2..9fbe9af 100644
--- a/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp
+++ b/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp
@@ -34,8 +34,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop worker
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -53,8 +53,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop worker
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -72,8 +72,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop worker
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: DeclRefExpr{{.*}} 'ConvertsToInt' lvalue ParmVar
   // CHECK-NEXT: ForStmt
@@ -92,8 +92,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop worker(CTI)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: DeclRefExpr{{.*}} 'Int' lvalue ParmVar
   // CHECK-NEXT: ForStmt
@@ -112,8 +112,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
 #pragma acc loop worker(num:IsI)
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}}'I' 'unsigned int'
   // CHECK-NEXT: ForStmt
@@ -159,8 +159,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -175,8 +175,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -191,8 +191,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <UserDefinedConversion>
   // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int'
@@ -211,8 +211,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'IsI' 'int'
@@ -229,8 +229,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) {
   // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int'
   // CHECK-NEXT: NullStmt
   //
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'unsigned int'
   // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}}'unsigned int' depth 0 index 0 I
@@ -277,8 +277,8 @@ void uses() {
 #pragma acc loop worker
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -296,8 +296,8 @@ void uses() {
 #pragma acc loop worker
   for(int i = 0; i < 5; ++i);
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ForStmt
   // CHECK-NEXT: DeclStmt
@@ -320,8 +320,8 @@ void uses() {
   // CHECK-NEXT: VarDecl
   // CHECK-NEXT: CXXConstructExpr
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <UserDefinedConversion>
   // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int'
@@ -347,8 +347,8 @@ void uses() {
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl
 
-  // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels
-  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]]
+  // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels
+  // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels
   // CHECK-NEXT: worker clause{{.*}}
   // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var
diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg
index 0e81a550..1d0cf9a 100644
--- a/clang/utils/perf-training/bolt.lit.cfg
+++ b/clang/utils/perf-training/bolt.lit.cfg
@@ -49,3 +49,6 @@ config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ "
 config.substitutions.append(("%clang_skip_driver", config.clang))
 config.substitutions.append(("%clang", config.clang))
 config.substitutions.append(("%test_root", config.test_exec_root))
+config.substitutions.append(('%cmake_generator', config.cmake_generator))
+config.substitutions.append(('%cmake', config.cmake_exe))
+config.substitutions.append(('%llvm_src_dir', config.llvm_src_dir))
diff --git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in
index 54de127..3de5026 100644
--- a/clang/utils/perf-training/bolt.lit.site.cfg.in
+++ b/clang/utils/perf-training/bolt.lit.site.cfg.in
@@ -11,6 +11,9 @@ config.python_exe = "@Python3_EXECUTABLE@"
 config.clang_obj_root = path(r"@CLANG_BINARY_DIR@")
 config.clang_bolt_mode = "@CLANG_BOLT@"
 config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@"
+config.cmake_exe = "@CMAKE_COMMAND@"
+config.llvm_src_dir ="@CMAKE_SOURCE_DIR@"
+config.cmake_generator ="@CMAKE_GENERATOR@"
 
 # Let the main config do the real work.
 lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg")
diff --git a/clang/utils/perf-training/lit.cfg b/clang/utils/perf-training/lit.cfg
index 0bd06c0..654961e 100644
--- a/clang/utils/perf-training/lit.cfg
+++ b/clang/utils/perf-training/lit.cfg
@@ -34,8 +34,11 @@ config.test_format = lit.formats.ShTest(use_lit_shell == "0")
 config.substitutions.append( ('%clang_cpp_skip_driver', ' %s %s %s ' % (cc1_wrapper, config.clang, sysroot_flags)))
 config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ %s ' % (config.clang, sysroot_flags)))
 config.substitutions.append( ('%clang_skip_driver', ' %s %s %s ' % (cc1_wrapper, config.clang, sysroot_flags)))
-config.substitutions.append( ('%clang', ' %s %s ' % (config.clang, sysroot_flags) ) )
+config.substitutions.append( ('%clang', '%s %s ' % (config.clang, sysroot_flags) ) )
 config.substitutions.append( ('%test_root', config.test_exec_root ) )
+config.substitutions.append( ('%cmake_generator', config.cmake_generator ) )
+config.substitutions.append( ('%cmake', config.cmake_exe ) )
+config.substitutions.append( ('%llvm_src_dir', config.llvm_src_dir ) )
 
 config.environment['LLVM_PROFILE_FILE'] = 'perf-training-%4m.profraw'
 
diff --git a/clang/utils/perf-training/lit.site.cfg.in b/clang/utils/perf-training/lit.site.cfg.in
index fae9306..9d279d5 100644
--- a/clang/utils/perf-training/lit.site.cfg.in
+++ b/clang/utils/perf-training/lit.site.cfg.in
@@ -8,6 +8,9 @@ config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.test_source_root = "@CLANG_PGO_TRAINING_DATA@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.python_exe = "@Python3_EXECUTABLE@"
+config.cmake_exe = "@CMAKE_COMMAND@"
+config.llvm_src_dir ="@CMAKE_SOURCE_DIR@"
+config.cmake_generator ="@CMAKE_GENERATOR@"
 
 # Let the main config do the real work.
 lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/lit.cfg")
diff --git a/clang/utils/perf-training/llvm-support/build.test b/clang/utils/perf-training/llvm-support/build.test
new file mode 100644
index 0000000..f29a594
--- /dev/null
+++ b/clang/utils/perf-training/llvm-support/build.test
@@ -0,0 +1,2 @@
+RUN: %cmake -G %cmake_generator -B %t -S %llvm_src_dir -DCMAKE_C_COMPILER=%clang -DCMAKE_CXX_COMPILER=%clang -DCMAKE_CXX_FLAGS="--driver-mode=g++" -DCMAKE_BUILD_TYPE=Release
+RUN: %cmake --build %t -v --target LLVMSupport
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 989a572..793e700 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -191,12 +191,12 @@ conformance.</p>
     <tr>
       <td>Slay Some Earthly Demons III</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3341.pdf">N3341</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="full" align="center">Yes</td>
     </tr>
     <tr>
       <td>Slay Some Earthly Demons IV</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3342.pdf">N3342</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="full" align="center">Yes</td>
     </tr>
     <tr>
       <td>Slay Some Earthly Demons VI</td>
@@ -211,7 +211,7 @@ conformance.</p>
     <tr>
       <td>Slay Some Earthly Demons VIII</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3346.pdf">N3346</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="full" align="center">Yes</td>
     </tr>
     <tr>
       <td>Introduce complex literals v. 2</td>
diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 6b96a51..4873bec 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -713,6 +713,9 @@ macro(add_custom_libcxx name prefix)
     USES_TERMINAL_INSTALL 1
     LIST_SEPARATOR |
     EXCLUDE_FROM_ALL TRUE
+    # Ensure that DESTDIR=... set in the out environment does not affect this
+    # target (we always need to install to the build directory).
+    INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build ${prefix}/build --target install
     INSTALL_BYPRODUCTS "${prefix}/lib/libc++.a" "${prefix}/lib/libc++abi.a"
     )
 
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 456d533..03d756e 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -177,6 +177,7 @@ public:
   NODE(parser, Call)
   NODE(parser, CallStmt)
   NODE(CallStmt, Chevrons)
+  NODE(CallStmt, StarOrExpr)
   NODE(parser, CaseConstruct)
   NODE(CaseConstruct, Case)
   NODE(parser, CaseSelector)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index e851874..8ba7ca2 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3247,13 +3247,14 @@ struct FunctionReference {
 
 // R1521 call-stmt -> CALL procedure-designator [ chevrons ]
 //         [( [actual-arg-spec-list] )]
-// (CUDA) chevrons -> <<< scalar-expr, scalar-expr [,
+// (CUDA) chevrons -> <<< * | scalar-expr, scalar-expr [,
 //          scalar-int-expr [, scalar-int-expr ] ] >>>
 struct CallStmt {
   BOILERPLATE(CallStmt);
+  WRAPPER_CLASS(StarOrExpr, std::optional<ScalarExpr>);
   struct Chevrons {
     TUPLE_CLASS_BOILERPLATE(Chevrons);
-    std::tuple<ScalarExpr, ScalarExpr, std::optional<ScalarIntExpr>,
+    std::tuple<StarOrExpr, ScalarExpr, std::optional<ScalarIntExpr>,
         std::optional<ScalarIntExpr>>
         t;
   };
@@ -3694,9 +3695,8 @@ struct OmpMapClause {
 
 // 2.9.5 order-clause -> ORDER ([order-modifier :]concurrent)
 struct OmpOrderModifier {
-  UNION_CLASS_BOILERPLATE(OmpOrderModifier);
   ENUM_CLASS(Kind, Reproducible, Unconstrained)
-  std::variant<Kind> u;
+  WRAPPER_CLASS_BOILERPLATE(OmpOrderModifier, Kind);
 };
 
 struct OmpOrderClause {
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 46caafe..412b0e7 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1092,8 +1092,7 @@ Order make(const parser::OmpClause::Order &inp,
   auto &t1 = std::get<wrapped::Type>(inp.v.t);
 
   auto convert3 = [&](const parser::OmpOrderModifier &s) {
-    return Fortran::common::visit(
-        [&](parser::OmpOrderModifier::Kind k) { return convert1(k); }, s.u);
+    return convert1(s.v);
   };
   return Order{
       {/*OrderModifier=*/maybeApply(convert3, t0), /*Ordering=*/convert2(t1)}};
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 881f541..8e9de3d 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -337,6 +337,13 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
                                                  seqTy.getConstantArraySize());
         }
         bytes = rewriter.create<mlir::arith::MulIOp>(loc, nbElem, width);
+      } else if (fir::isa_derived(op.getInType())) {
+        mlir::Type structTy = typeConverter->convertType(op.getInType());
+        std::size_t structSize = dl->getTypeSizeInBits(structTy) / 8;
+        bytes = builder.createIntegerConstant(loc, builder.getIndexType(),
+                                              structSize);
+      } else {
+        mlir::emitError(loc, "unsupported type in cuf.alloc\n");
       }
       mlir::func::FuncOp func =
           fir::runtime::getRuntimeFunc<mkRTKey(CUFMemAlloc)>(loc, builder);
diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp
index 2b7da18..e365cd2 100644
--- a/flang/lib/Parser/program-parsers.cpp
+++ b/flang/lib/Parser/program-parsers.cpp
@@ -474,10 +474,13 @@ TYPE_CONTEXT_PARSER("function reference"_en_US,
 
 // R1521 call-stmt -> CALL procedure-designator [chevrons]
 ///                          [( [actual-arg-spec-list] )]
-// (CUDA) chevrons -> <<< scalar-expr, scalar-expr [, scalar-int-expr
+// (CUDA) chevrons -> <<< * | scalar-expr, scalar-expr [, scalar-int-expr
 //                      [, scalar-int-expr ] ] >>>
+constexpr auto starOrExpr{
+    construct<CallStmt::StarOrExpr>("*" >> pure<std::optional<ScalarExpr>>() ||
+        applyFunction(presentOptional<ScalarExpr>, scalarExpr))};
 TYPE_PARSER(extension<LanguageFeature::CUDA>(
-    "<<<" >> construct<CallStmt::Chevrons>(scalarExpr, "," >> scalarExpr,
+    "<<<" >> construct<CallStmt::Chevrons>(starOrExpr, ", " >> scalarExpr,
                  maybe("," >> scalarIntExpr), maybe("," >> scalarIntExpr)) /
         ">>>"))
 constexpr auto actualArgSpecList{optionalList(actualArgSpec)};
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index bbb126d..5d70f34 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -1703,6 +1703,13 @@ public:
   void Unparse(const IntrinsicStmt &x) { // R1519
     Word("INTRINSIC :: "), Walk(x.v, ", ");
   }
+  void Unparse(const CallStmt::StarOrExpr &x) {
+    if (x.v) {
+      Walk(*x.v);
+    } else {
+      Word("*");
+    }
+  }
   void Unparse(const CallStmt::Chevrons &x) { // CUDA
     Walk(std::get<0>(x.t)); // grid
     Word(","), Walk(std::get<1>(x.t)); // block
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index c70c8a8..ead9982 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -3066,11 +3066,17 @@ std::optional<Chevrons> ExpressionAnalyzer::AnalyzeChevrons(
     return false;
   }};
   if (const auto &chevrons{call.chevrons}) {
-    if (auto expr{Analyze(std::get<0>(chevrons->t))};
-        expr && checkLaunchArg(*expr, "grid")) {
-      result.emplace_back(*expr);
+    auto &starOrExpr{std::get<0>(chevrons->t)};
+    if (starOrExpr.v) {
+      if (auto expr{Analyze(*starOrExpr.v)};
+          expr && checkLaunchArg(*expr, "grid")) {
+        result.emplace_back(*expr);
+      } else {
+        return std::nullopt;
+      }
     } else {
-      return std::nullopt;
+      result.emplace_back(
+          AsGenericExpr(evaluate::Constant<evaluate::CInteger>{-1}));
     }
     if (auto expr{Analyze(std::get<1>(chevrons->t))};
         expr && checkLaunchArg(*expr, "block")) {
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index abb7ebb..88cdf3c 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -25,6 +25,55 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
   blockDim.x = blockX;
   blockDim.y = blockY;
   blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (gridDim.x > 0) {
+        maxBlocks = maxBlocks / gridDim.x;
+      }
+      if (gridDim.y > 0) {
+        maxBlocks = maxBlocks / gridDim.y;
+      }
+      if (gridDim.z > 0) {
+        maxBlocks = maxBlocks / gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
   cudaStream_t stream = 0; // TODO stream managment
   CUDA_REPORT_IF_ERROR(
       cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +90,55 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
   config.blockDim.x = blockX;
   config.blockDim.y = blockY;
   config.blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (config.gridDim.x > 0) {
+        maxBlocks = maxBlocks / config.gridDim.x;
+      }
+      if (config.gridDim.y > 0) {
+        maxBlocks = maxBlocks / config.gridDim.y;
+      }
+      if (config.gridDim.z > 0) {
+        maxBlocks = maxBlocks / config.gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        config.gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        config.gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        config.gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
   config.dynamicSmemBytes = smem;
   config.stream = 0; // TODO stream managment
   cudaLaunchAttribute launchAttr[1];
diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir
index 2582141..88b1a00 100644
--- a/flang/test/Fir/CUDA/cuda-alloc-free.fir
+++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir
@@ -61,4 +61,16 @@ func.func @_QPsub3(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref<
 // CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 // CHECK: fir.call @_FortranACUFMemFree
 
+func.func @_QPtest_type() {
+  %0 = cuf.alloc !fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_typeEa"} -> !fir.ref<!fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}>>
+  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_typeEa"} : (!fir.ref<!fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}>>) -> !fir.ref<!fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}>>
+  cuf.free %1 : !fir.ref<!fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}>> {data_attr = #cuf.cuda<device>}
+  return
+}
+
+// CHECK-LABEL: func.func @_QPtest_type()
+// CHECK: %[[BYTES:.*]] = arith.constant 12 : index
+// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
+// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+
 } // end module
diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
index 82d1a61..eb24138 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -47,7 +47,10 @@ contains
 ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
 
     call dev_kernel1<<<1, 32>>>(a)
-! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref<f32>)
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}}) : (!fir.ref<f32>)
+
+    call dev_kernel1<<<*, 32>>>(a)
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}})
   end
 
 end
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index 9341f05..7005ef0 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -40,6 +40,7 @@ module m
     call globalsub<<<1, 2>>>
     call globalsub<<<1, 2, 3>>>
     call globalsub<<<1, 2, 3, 4>>>
+    call globalsub<<<*,5>>>
     allocate(pa(32), pinned = isPinned)
   end subroutine
 end module
diff --git a/flang/test/Parser/cuf-sanity-tree.CUF b/flang/test/Parser/cuf-sanity-tree.CUF
index 2820441..a8b2f93 100644
--- a/flang/test/Parser/cuf-sanity-tree.CUF
+++ b/flang/test/Parser/cuf-sanity-tree.CUF
@@ -166,7 +166,7 @@ include "cuf-sanity-common"
 !CHECK: | | | | | Call
 !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub'
 !CHECK: | | | | | Chevrons
-!CHECK: | | | | | | Scalar -> Expr = '1_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
 !CHECK: | | | | | | Scalar -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
@@ -174,7 +174,7 @@ include "cuf-sanity-common"
 !CHECK: | | | | | Call
 !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub'
 !CHECK: | | | | | Chevrons
-!CHECK: | | | | | | Scalar -> Expr = '1_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
 !CHECK: | | | | | | Scalar -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
@@ -184,7 +184,7 @@ include "cuf-sanity-common"
 !CHECK: | | | | | Call
 !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub'
 !CHECK: | | | | | Chevrons
-!CHECK: | | | | | | Scalar -> Expr = '1_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
 !CHECK: | | | | | | Scalar -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF
index d4be347..2e2df9a 100644
--- a/flang/test/Parser/cuf-sanity-unparse.CUF
+++ b/flang/test/Parser/cuf-sanity-unparse.CUF
@@ -43,6 +43,7 @@ include "cuf-sanity-common"
 !CHECK:    CALL globalsub<<<1_4,2_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4,4_4>>>()
+!CHECK:    CALL globalsub<<<-1_4,5_4>>>()
 !CHECK:   ALLOCATE(pa(32_4), PINNED=ispinned)
 !CHECK:  END SUBROUTINE
 !CHECK: END MODULE
diff --git a/lldb/test/API/lua_api/TestFileHandle.lua b/lldb/test/API/lua_api/TestFileHandle.lua
index aed45be..8db5ccb 100644
--- a/lldb/test/API/lua_api/TestFileHandle.lua
+++ b/lldb/test/API/lua_api/TestFileHandle.lua
@@ -2,7 +2,8 @@ _T = require('lua_lldb_test').create_test('TestFileHandle')
 
 function _T:TestLegacyFileOutScript()
     local f = io.open(self.output, 'w')
-    self.debugger:SetOutputFile(f)
+    local sbf = lldb.SBFile(f)
+    self.debugger:SetOutputFile(sbf)
     self:handle_command('script print(1+1)')
     self.debugger:GetOutputFileHandle():write('FOO\n')
     self.debugger:GetOutputFileHandle():flush()
@@ -15,7 +16,8 @@ end
 
 function _T:TestLegacyFileOut()
     local f = io.open(self.output, 'w')
-    self.debugger:SetOutputFile(f)
+    local sbf = lldb.SBFile(f)
+    self.debugger:SetOutputFile(sbf)
     self:handle_command('expression/x 3735928559', false)
     f:close()
 
@@ -26,8 +28,10 @@ end
 
 function _T:TestLegacyFileErr()
     local f = io.open(self.output, 'w')
-    self.debugger:SetErrorFile(f)
+    local sbf = lldb.SBFile(f)
+    self.debugger:SetErrorFile(sbf)
     self:handle_command('lol', false)
+    f:close()
 
     f = io.open(self.output, 'r')
     assertStrContains(f:read('*l'), 'is not a valid command')
diff --git a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
index 191b55c..685d0a8 100644
--- a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
+++ b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
@@ -55,7 +55,7 @@ fn:
   mrs x2, ssbs                          // AEK_SSBS
   abs z31.h, p7/m, z31.h                // AEK_SVE
   sqdmlslbt z0.d, z1.s, z31.s           // AEK_SVE2
-  aesd z0.b, z0.b, z31.b                // AEK_SVEAES
+  aesd z0.b, z0.b, z31.b                // AEK_SVE2AES
   bdep z0.b, z1.b, z31.b                // AEK_SVE2BITPERM
   rax1 z0.d, z0.d, z0.d                 // AEK_SVE2SHA3
   sm4e z0.s, z0.s, z0.s                 // AEK_SVE2SM4
diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp
index 9ea7a42..b3bfa61 100644
--- a/lldb/tools/lldb-dap/Breakpoint.cpp
+++ b/lldb/tools/lldb-dap/Breakpoint.cpp
@@ -7,10 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "Breakpoint.h"
-#include "DAP.h"
 #include "JSONUtils.h"
+#include "lldb/API/SBAddress.h"
 #include "lldb/API/SBBreakpointLocation.h"
+#include "lldb/API/SBLineEntry.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/JSON.h"
+#include <cstddef>
+#include <cstdint>
+#include <string>
 
 using namespace lldb_dap;
 
@@ -51,7 +56,7 @@ void Breakpoint::CreateJsonObject(llvm::json::Object &object) {
 
   if (bp_addr.IsValid()) {
     std::string formatted_addr =
-        "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(g_dap.target));
+        "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(bp.GetTarget()));
     object.try_emplace("instructionReference", formatted_addr);
     auto line_entry = bp_addr.GetLineEntry();
     const auto line = line_entry.GetLine();
diff --git a/lldb/tools/lldb-dap/Breakpoint.h b/lldb/tools/lldb-dap/Breakpoint.h
index ee9d373..a726f27 100644
--- a/lldb/tools/lldb-dap/Breakpoint.h
+++ b/lldb/tools/lldb-dap/Breakpoint.h
@@ -10,6 +10,7 @@
 #define LLDB_TOOLS_LLDB_DAP_BREAKPOINT_H
 
 #include "BreakpointBase.h"
+#include "DAPForward.h"
 #include "lldb/API/SBBreakpoint.h"
 
 namespace lldb_dap {
@@ -18,9 +19,8 @@ struct Breakpoint : public BreakpointBase {
   // The LLDB breakpoint associated wit this source breakpoint
   lldb::SBBreakpoint bp;
 
-  Breakpoint() = default;
-  Breakpoint(const llvm::json::Object &obj) : BreakpointBase(obj){};
-  Breakpoint(lldb::SBBreakpoint bp) : bp(bp) {}
+  Breakpoint(DAP &d, const llvm::json::Object &obj) : BreakpointBase(d, obj) {}
+  Breakpoint(DAP &d, lldb::SBBreakpoint bp) : BreakpointBase(d), bp(bp) {}
 
   void SetCondition() override;
   void SetHitCondition() override;
diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp
index f3cb06a..1e28c29 100644
--- a/lldb/tools/lldb-dap/BreakpointBase.cpp
+++ b/lldb/tools/lldb-dap/BreakpointBase.cpp
@@ -8,11 +8,12 @@
 
 #include "BreakpointBase.h"
 #include "JSONUtils.h"
+#include "llvm/ADT/StringRef.h"
 
 using namespace lldb_dap;
 
-BreakpointBase::BreakpointBase(const llvm::json::Object &obj)
-    : condition(std::string(GetString(obj, "condition"))),
+BreakpointBase::BreakpointBase(DAP &d, const llvm::json::Object &obj)
+    : dap(d), condition(std::string(GetString(obj, "condition"))),
       hitCondition(std::string(GetString(obj, "hitCondition"))) {}
 
 void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) {
diff --git a/lldb/tools/lldb-dap/BreakpointBase.h b/lldb/tools/lldb-dap/BreakpointBase.h
index 7930148..3c248dd 100644
--- a/lldb/tools/lldb-dap/BreakpointBase.h
+++ b/lldb/tools/lldb-dap/BreakpointBase.h
@@ -9,12 +9,14 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H
 #define LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H
 
-#include "llvm/Support/JSON.h"
+#include "DAPForward.h"
 #include <string>
 
 namespace lldb_dap {
 
 struct BreakpointBase {
+  // Associated DAP session.
+  DAP &dap;
 
   // An optional expression for conditional breakpoints.
   std::string condition;
@@ -22,8 +24,8 @@ struct BreakpointBase {
   // ignored. The backend is expected to interpret the expression as needed
   std::string hitCondition;
 
-  BreakpointBase() = default;
-  BreakpointBase(const llvm::json::Object &obj);
+  explicit BreakpointBase(DAP &d) : dap(d) {}
+  BreakpointBase(DAP &d, const llvm::json::Object &obj);
   virtual ~BreakpointBase() = default;
 
   virtual void SetCondition() = 0;
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 2833922..647e280 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -74,21 +74,21 @@ void DAP::PopulateExceptionBreakpoints() {
     exception_breakpoints = std::vector<ExceptionBreakpoint>{};
 
     if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
-      exception_breakpoints->emplace_back("cpp_catch", "C++ Catch",
+      exception_breakpoints->emplace_back(*this, "cpp_catch", "C++ Catch",
                                           lldb::eLanguageTypeC_plus_plus);
-      exception_breakpoints->emplace_back("cpp_throw", "C++ Throw",
+      exception_breakpoints->emplace_back(*this, "cpp_throw", "C++ Throw",
                                           lldb::eLanguageTypeC_plus_plus);
     }
     if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) {
-      exception_breakpoints->emplace_back("objc_catch", "Objective-C Catch",
-                                          lldb::eLanguageTypeObjC);
-      exception_breakpoints->emplace_back("objc_throw", "Objective-C Throw",
-                                          lldb::eLanguageTypeObjC);
+      exception_breakpoints->emplace_back(
+          *this, "objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC);
+      exception_breakpoints->emplace_back(
+          *this, "objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC);
     }
     if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) {
-      exception_breakpoints->emplace_back("swift_catch", "Swift Catch",
+      exception_breakpoints->emplace_back(*this, "swift_catch", "Swift Catch",
                                           lldb::eLanguageTypeSwift);
-      exception_breakpoints->emplace_back("swift_throw", "Swift Throw",
+      exception_breakpoints->emplace_back(*this, "swift_throw", "Swift Throw",
                                           lldb::eLanguageTypeSwift);
     }
     // Besides handling the hardcoded list of languages from above, we try to
@@ -119,7 +119,7 @@ void DAP::PopulateExceptionBreakpoints() {
             raw_throw_keyword ? raw_throw_keyword : "throw";
 
         exception_breakpoints->emplace_back(
-            raw_lang_name + "_" + throw_keyword,
+            *this, raw_lang_name + "_" + throw_keyword,
             capitalized_lang_name + " " + capitalize(throw_keyword), lang);
       }
 
@@ -130,7 +130,7 @@ void DAP::PopulateExceptionBreakpoints() {
             raw_catch_keyword ? raw_catch_keyword : "catch";
 
         exception_breakpoints->emplace_back(
-            raw_lang_name + "_" + catch_keyword,
+            *this, raw_lang_name + "_" + catch_keyword,
             capitalized_lang_name + " " + capitalize(catch_keyword), lang);
       }
     }
@@ -1060,7 +1060,7 @@ void DAP::SetThreadFormat(llvm::StringRef format) {
 InstructionBreakpoint *
 DAP::GetInstructionBreakpoint(const lldb::break_id_t bp_id) {
   for (auto &bp : instruction_breakpoints) {
-    if (bp.second.id == bp_id)
+    if (bp.second.bp.GetID() == bp_id)
       return &bp.second;
   }
   return nullptr;
diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h
index 159d999..0196d83 100644
--- a/lldb/tools/lldb-dap/DAPForward.h
+++ b/lldb/tools/lldb-dap/DAPForward.h
@@ -9,6 +9,8 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_DAPFORWARD_H
 #define LLDB_TOOLS_LLDB_DAP_DAPFORWARD_H
 
+// IWYU pragma: begin_exports
+
 namespace lldb_dap {
 struct BreakpointBase;
 struct ExceptionBreakpoint;
@@ -16,6 +18,7 @@ struct FunctionBreakpoint;
 struct SourceBreakpoint;
 struct Watchpoint;
 struct InstructionBreakpoint;
+struct DAP;
 } // namespace lldb_dap
 
 namespace lldb {
@@ -35,6 +38,7 @@ class SBLanguageRuntime;
 class SBLaunchInfo;
 class SBLineEntry;
 class SBListener;
+class SBModule;
 class SBProcess;
 class SBStream;
 class SBStringList;
@@ -44,4 +48,12 @@ class SBValue;
 class SBWatchpoint;
 } // namespace lldb
 
+namespace llvm {
+namespace json {
+class Object;
+} // namespace json
+} // namespace llvm
+
+// IWYU pragma: end_exports
+
 #endif
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
index 130c237..0fb865c 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
@@ -9,6 +9,7 @@
 #include "ExceptionBreakpoint.h"
 #include "BreakpointBase.h"
 #include "DAP.h"
+#include "lldb/API/SBTarget.h"
 
 namespace lldb_dap {
 
@@ -17,8 +18,8 @@ void ExceptionBreakpoint::SetBreakpoint() {
     return;
   bool catch_value = filter.find("_catch") != std::string::npos;
   bool throw_value = filter.find("_throw") != std::string::npos;
-  bp = g_dap.target.BreakpointCreateForException(language, catch_value,
-                                                 throw_value);
+  bp = dap.target.BreakpointCreateForException(language, catch_value,
+                                               throw_value);
   // See comments in BreakpointBase::GetBreakpointLabel() for details of why
   // we add a label to our breakpoints.
   bp.AddName(BreakpointBase::GetBreakpointLabel());
@@ -27,7 +28,7 @@ void ExceptionBreakpoint::SetBreakpoint() {
 void ExceptionBreakpoint::ClearBreakpoint() {
   if (!bp.IsValid())
     return;
-  g_dap.target.BreakpointDelete(bp.GetID());
+  dap.target.BreakpointDelete(bp.GetID());
   bp = lldb::SBBreakpoint();
 }
 
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
index 7b81d84..b83c5ef7 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
@@ -9,21 +9,25 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H
 #define LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H
 
-#include <string>
-
+#include "DAPForward.h"
 #include "lldb/API/SBBreakpoint.h"
+#include "lldb/lldb-enumerations.h"
+#include <string>
+#include <utility>
 
 namespace lldb_dap {
 
 struct ExceptionBreakpoint {
+  DAP &dap;
   std::string filter;
   std::string label;
   lldb::LanguageType language;
-  bool default_value;
+  bool default_value = false;
   lldb::SBBreakpoint bp;
-  ExceptionBreakpoint(std::string f, std::string l, lldb::LanguageType lang)
-      : filter(std::move(f)), label(std::move(l)), language(lang),
-        default_value(false), bp() {}
+  ExceptionBreakpoint(DAP &d, std::string f, std::string l,
+                      lldb::LanguageType lang)
+      : dap(d), filter(std::move(f)), label(std::move(l)), language(lang),
+        bp() {}
 
   void SetBreakpoint();
   void ClearBreakpoint();
diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
index 216c685..f266d75 100644
--- a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
@@ -12,13 +12,13 @@
 
 namespace lldb_dap {
 
-FunctionBreakpoint::FunctionBreakpoint(const llvm::json::Object &obj)
-    : Breakpoint(obj), functionName(std::string(GetString(obj, "name"))) {}
+FunctionBreakpoint::FunctionBreakpoint(DAP &d, const llvm::json::Object &obj)
+    : Breakpoint(d, obj), functionName(std::string(GetString(obj, "name"))) {}
 
 void FunctionBreakpoint::SetBreakpoint() {
   if (functionName.empty())
     return;
-  bp = g_dap.target.BreakpointCreateByName(functionName.c_str());
+  bp = dap.target.BreakpointCreateByName(functionName.c_str());
   Breakpoint::SetBreakpoint();
 }
 
diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.h b/lldb/tools/lldb-dap/FunctionBreakpoint.h
index b15ff19..93f0b93 100644
--- a/lldb/tools/lldb-dap/FunctionBreakpoint.h
+++ b/lldb/tools/lldb-dap/FunctionBreakpoint.h
@@ -10,14 +10,14 @@
 #define LLDB_TOOLS_LLDB_DAP_FUNCTIONBREAKPOINT_H
 
 #include "Breakpoint.h"
+#include "DAPForward.h"
 
 namespace lldb_dap {
 
 struct FunctionBreakpoint : public Breakpoint {
   std::string functionName;
 
-  FunctionBreakpoint() = default;
-  FunctionBreakpoint(const llvm::json::Object &obj);
+  FunctionBreakpoint(DAP &dap, const llvm::json::Object &obj);
 
   // Set this breakpoint in LLDB as a new breakpoint
   void SetBreakpoint();
diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp
index e3a8460..37daa8f 100644
--- a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp
@@ -10,20 +10,25 @@
 #include "InstructionBreakpoint.h"
 #include "DAP.h"
 #include "JSONUtils.h"
+#include "lldb/API/SBBreakpoint.h"
+#include "lldb/API/SBTarget.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace lldb_dap {
 
 // Instruction Breakpoint
-InstructionBreakpoint::InstructionBreakpoint(const llvm::json::Object &obj)
-    : Breakpoint(obj), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0),
+InstructionBreakpoint::InstructionBreakpoint(DAP &d,
+                                             const llvm::json::Object &obj)
+    : Breakpoint(d, obj), instructionAddressReference(LLDB_INVALID_ADDRESS),
       offset(GetSigned(obj, "offset", 0)) {
   GetString(obj, "instructionReference")
       .getAsInteger(0, instructionAddressReference);
   instructionAddressReference += offset;
 }
 
-void InstructionBreakpoint::SetInstructionBreakpoint() {
-  bp = g_dap.target.BreakpointCreateByAddress(instructionAddressReference);
-  id = bp.GetID();
+void InstructionBreakpoint::SetBreakpoint() {
+  bp = dap.target.BreakpointCreateByAddress(instructionAddressReference);
+  Breakpoint::SetBreakpoint();
 }
+
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.h b/lldb/tools/lldb-dap/InstructionBreakpoint.h
index 53912af..b2e66a9 100644
--- a/lldb/tools/lldb-dap/InstructionBreakpoint.h
+++ b/lldb/tools/lldb-dap/InstructionBreakpoint.h
@@ -11,6 +11,9 @@
 #define LLDB_TOOLS_LLDB_DAP_INSTRUCTIONBREAKPOINT_H
 
 #include "Breakpoint.h"
+#include "DAPForward.h"
+#include "lldb/lldb-types.h"
+#include <cstdint>
 
 namespace lldb_dap {
 
@@ -18,16 +21,12 @@ namespace lldb_dap {
 struct InstructionBreakpoint : public Breakpoint {
 
   lldb::addr_t instructionAddressReference;
-  int32_t id;
   int32_t offset;
 
-  InstructionBreakpoint()
-      : Breakpoint(), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0),
-        offset(0) {}
-  InstructionBreakpoint(const llvm::json::Object &obj);
+  InstructionBreakpoint(DAP &d, const llvm::json::Object &obj);
 
   // Set instruction breakpoint in LLDB as a new breakpoint
-  void SetInstructionBreakpoint();
+  void SetBreakpoint();
 };
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 97fe6b4..a7300ab 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -6,27 +6,53 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <iomanip>
-#include <optional>
-#include <sstream>
-#include <string.h>
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/ScopedPrinter.h"
+#include "JSONUtils.h"
 
+#include "BreakpointBase.h"
+#include "DAP.h"
+#include "ExceptionBreakpoint.h"
+#include "LLDBUtils.h"
+#include "lldb/API/SBAddress.h"
+#include "lldb/API/SBCompileUnit.h"
 #include "lldb/API/SBDeclaration.h"
+#include "lldb/API/SBEnvironment.h"
+#include "lldb/API/SBError.h"
+#include "lldb/API/SBFileSpec.h"
+#include "lldb/API/SBFrame.h"
+#include "lldb/API/SBFunction.h"
+#include "lldb/API/SBLineEntry.h"
+#include "lldb/API/SBModule.h"
+#include "lldb/API/SBQueue.h"
+#include "lldb/API/SBSection.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBStringList.h"
 #include "lldb/API/SBStructuredData.h"
+#include "lldb/API/SBTarget.h"
+#include "lldb/API/SBThread.h"
+#include "lldb/API/SBType.h"
 #include "lldb/API/SBValue.h"
-#include "lldb/Host/PosixApi.h"
-
-#include "DAP.h"
-#include "ExceptionBreakpoint.h"
-#include "JSONUtils.h"
-#include "LLDBUtils.h"
+#include "lldb/Host/PosixApi.h" // IWYU pragma: keep
+#include "lldb/lldb-defines.h"
+#include "lldb/lldb-enumerations.h"
+#include "lldb/lldb-types.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <chrono>
+#include <climits>
+#include <cstddef>
+#include <iomanip>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
 
 namespace lldb_dap {
 
@@ -831,70 +857,6 @@ llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread) {
                                               {"presentationHint", "label"}});
 }
 
-// Response to `setInstructionBreakpoints` request.
-// "Breakpoint": {
-//   "type": "object",
-//   "description": "Response to `setInstructionBreakpoints` request.",
-//   "properties": {
-//     "id": {
-//       "type": "number",
-//       "description": "The identifier for the breakpoint. It is needed if
-//       breakpoint events are used to update or remove breakpoints."
-//     },
-//     "verified": {
-//       "type": "boolean",
-//       "description": "If true, the breakpoint could be set (but not
-//       necessarily at the desired location."
-//     },
-//     "message": {
-//       "type": "string",
-//       "description": "A message about the state of the breakpoint.
-//       This is shown to the user and can be used to explain why a breakpoint
-//       could not be verified."
-//     },
-//     "source": {
-//       "type": "Source",
-//       "description": "The source where the breakpoint is located."
-//     },
-//     "line": {
-//       "type": "number",
-//       "description": "The start line of the actual range covered by the
-//       breakpoint."
-//     },
-//     "column": {
-//       "type": "number",
-//       "description": "The start column of the actual range covered by the
-//       breakpoint."
-//     },
-//     "endLine": {
-//       "type": "number",
-//       "description": "The end line of the actual range covered by the
-//       breakpoint."
-//     },
-//     "endColumn": {
-//       "type": "number",
-//       "description": "The end column of the actual range covered by the
-//       breakpoint. If no end line is given, then the end column is assumed to
-//       be in the start line."
-//     },
-//     "instructionReference": {
-//       "type": "string",
-//       "description": "A memory reference to where the breakpoint is set."
-//     },
-//     "offset": {
-//       "type": "number",
-//       "description": "The offset from the instruction reference.
-//       This can be negative."
-//     },
-//   },
-//   "required": [ "id", "verified", "line"]
-// }
-llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp) {
-  llvm::json::Object object;
-  ibp->CreateJsonObject(object);
-  return llvm::json::Value(std::move(object));
-}
-
 // "Thread": {
 //   "type": "object",
 //   "description": "A Thread",
@@ -1523,7 +1485,7 @@ void FilterAndGetValueForKey(const lldb::SBStructuredData data, const char *key,
                              llvm::json::Object &out) {
   lldb::SBStructuredData value = data.GetValueForKey(key);
   std::string key_utf8 = llvm::json::fixUTF8(key);
-  if (strcmp(key, "modules") == 0)
+  if (llvm::StringRef(key) == "modules")
     return;
   switch (value.GetType()) {
   case lldb::eStructuredDataTypeFloat:
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 54fc432..2e13459 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -9,14 +9,21 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_JSONUTILS_H
 #define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H
 
-#include "BreakpointBase.h"
 #include "DAPForward.h"
-#include "lldb/API/SBModule.h"
+#include "lldb/API/SBCompileUnit.h"
+#include "lldb/API/SBFileSpec.h"
+#include "lldb/API/SBLineEntry.h"
+#include "lldb/API/SBType.h"
+#include "lldb/API/SBValue.h"
+#include "lldb/lldb-types.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/JSON.h"
 #include <cstdint>
 #include <optional>
+#include <string>
 #include <unordered_map>
+#include <utility>
+#include <vector>
 
 namespace lldb_dap {
 
@@ -380,17 +387,6 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame);
 ///     definition outlined by Microsoft.
 llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread);
 
-/// Create a "instruction" object for a LLDB disassemble object as described in
-/// the Visual Studio Code debug adaptor definition.
-///
-/// \param[in] bp
-///     The LLDB instruction object used to populate the disassembly
-///     instruction.
-/// \return
-///     A "Scope" JSON object with that follows the formal JSON
-///     definition outlined by Microsoft.
-llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp);
-
 /// Create a "Thread" object for a LLDB thread object.
 ///
 /// This function will fill in the following keys in the returned
diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp
index d1a3a5b..418e205 100644
--- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp
@@ -7,20 +7,33 @@
 //===----------------------------------------------------------------------===//
 
 #include "SourceBreakpoint.h"
+#include "BreakpointBase.h"
 #include "DAP.h"
 #include "JSONUtils.h"
+#include "lldb/API/SBBreakpoint.h"
+#include "lldb/API/SBFileSpecList.h"
+#include "lldb/API/SBFrame.h"
+#include "lldb/API/SBTarget.h"
+#include "lldb/API/SBThread.h"
+#include "lldb/API/SBValue.h"
+#include "lldb/lldb-enumerations.h"
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <utility>
 
 namespace lldb_dap {
 
-SourceBreakpoint::SourceBreakpoint(const llvm::json::Object &obj)
-    : Breakpoint(obj), logMessage(std::string(GetString(obj, "logMessage"))),
+SourceBreakpoint::SourceBreakpoint(DAP &dap, const llvm::json::Object &obj)
+    : Breakpoint(dap, obj),
+      logMessage(std::string(GetString(obj, "logMessage"))),
       line(GetUnsigned(obj, "line", 0)), column(GetUnsigned(obj, "column", 0)) {
 }
 
 void SourceBreakpoint::SetBreakpoint(const llvm::StringRef source_path) {
   lldb::SBFileSpecList module_list;
-  bp = g_dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line,
-                                               column, 0, module_list);
+  bp = dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line,
+                                             column, 0, module_list);
   if (!logMessage.empty())
     SetLogMessage();
   Breakpoint::SetBreakpoint();
@@ -136,7 +149,7 @@ lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text,
         return error;
       }
       // hex number in the text
-      if (isxdigit(text[0])) {
+      if (std::isxdigit(text[0])) {
         // Make a string that can hold onto two hex chars plus a
         // NULL terminator
         char hex_str[3] = {0, 0, 0};
@@ -144,7 +157,7 @@ lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text,
 
         text = text.drop_front();
 
-        if (!text.empty() && isxdigit(text[0])) {
+        if (!text.empty() && std::isxdigit(text[0])) {
           hex_str[1] = text[0];
           text = text.drop_front();
         }
@@ -279,7 +292,7 @@ void SourceBreakpoint::SetLogMessage() {
 void SourceBreakpoint::NotifyLogMessageError(llvm::StringRef error) {
   std::string message = "Log message has error: ";
   message += error;
-  g_dap.SendOutput(OutputType::Console, message);
+  dap.SendOutput(OutputType::Console, message);
 }
 
 /*static*/
@@ -304,14 +317,16 @@ bool SourceBreakpoint::BreakpointHitCallback(
           frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget);
       if (value.GetError().Fail())
         value = frame.EvaluateExpression(expr);
-      output += VariableDescription(value).display_value;
+      output +=
+          VariableDescription(value, bp->dap.enable_auto_variable_summaries)
+              .display_value;
     } else {
       output += messagePart.text;
     }
   }
   if (!output.empty() && output.back() != '\n')
     output.push_back('\n'); // Ensure log message has line break.
-  g_dap.SendOutput(OutputType::Console, output.c_str());
+  bp->dap.SendOutput(OutputType::Console, output.c_str());
 
   // Do not stop.
   return false;
diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.h b/lldb/tools/lldb-dap/SourceBreakpoint.h
index aa3fbe6..064bd29 100644
--- a/lldb/tools/lldb-dap/SourceBreakpoint.h
+++ b/lldb/tools/lldb-dap/SourceBreakpoint.h
@@ -10,7 +10,12 @@
 #define LLDB_TOOLS_LLDB_DAP_SOURCEBREAKPOINT_H
 
 #include "Breakpoint.h"
+#include "DAPForward.h"
+#include "lldb/API/SBError.h"
 #include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <string>
+#include <vector>
 
 namespace lldb_dap {
 
@@ -31,8 +36,7 @@ struct SourceBreakpoint : public Breakpoint {
   uint32_t line;   ///< The source line of the breakpoint or logpoint
   uint32_t column; ///< An optional source column of the breakpoint
 
-  SourceBreakpoint() : Breakpoint(), line(0), column(0) {}
-  SourceBreakpoint(const llvm::json::Object &obj);
+  SourceBreakpoint(DAP &d, const llvm::json::Object &obj);
 
   // Set this breakpoint in LLDB as a new breakpoint
   void SetBreakpoint(const llvm::StringRef source_path);
diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp
index 2176550..0e68a35 100644
--- a/lldb/tools/lldb-dap/Watchpoint.cpp
+++ b/lldb/tools/lldb-dap/Watchpoint.cpp
@@ -9,10 +9,17 @@
 #include "Watchpoint.h"
 #include "DAP.h"
 #include "JSONUtils.h"
+#include "lldb/API/SBTarget.h"
+#include "lldb/lldb-enumerations.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/JSON.h"
+#include <cstdint>
+#include <string>
 
 namespace lldb_dap {
-Watchpoint::Watchpoint(const llvm::json::Object &obj) : BreakpointBase(obj) {
+Watchpoint::Watchpoint(DAP &d, const llvm::json::Object &obj)
+    : BreakpointBase(d, obj) {
   llvm::StringRef dataId = GetString(obj, "dataId");
   std::string accessType = GetString(obj, "accessType").str();
   auto [addr_str, size_str] = dataId.split('/');
@@ -42,7 +49,7 @@ void Watchpoint::CreateJsonObject(llvm::json::Object &object) {
 }
 
 void Watchpoint::SetWatchpoint() {
-  wp = g_dap.target.WatchpointCreateByAddress(addr, size, options, error);
+  wp = dap.target.WatchpointCreateByAddress(addr, size, options, error);
   if (!condition.empty())
     SetCondition();
   if (!hitCondition.empty())
diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h
index 4d2e58e..77cea67 100644
--- a/lldb/tools/lldb-dap/Watchpoint.h
+++ b/lldb/tools/lldb-dap/Watchpoint.h
@@ -10,9 +10,12 @@
 #define LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H
 
 #include "BreakpointBase.h"
+#include "DAPForward.h"
 #include "lldb/API/SBError.h"
 #include "lldb/API/SBWatchpoint.h"
 #include "lldb/API/SBWatchpointOptions.h"
+#include "lldb/lldb-types.h"
+#include <cstddef>
 
 namespace lldb_dap {
 
@@ -24,9 +27,8 @@ struct Watchpoint : public BreakpointBase {
   lldb::SBWatchpoint wp;
   lldb::SBError error;
 
-  Watchpoint() = default;
-  Watchpoint(const llvm::json::Object &obj);
-  Watchpoint(lldb::SBWatchpoint wp) : wp(wp) {}
+  Watchpoint(DAP &d, const llvm::json::Object &obj);
+  Watchpoint(DAP &d, lldb::SBWatchpoint wp) : BreakpointBase(d), wp(wp) {}
 
   void SetCondition() override;
   void SetHitCondition() override;
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index a2f7be2..94184b7 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -8,24 +8,52 @@
 
 #include "DAP.h"
 #include "FifoFiles.h"
+#include "JSONUtils.h"
+#include "LLDBUtils.h"
+#include "OutputRedirector.h"
 #include "RunInTerminal.h"
 #include "Watchpoint.h"
 #include "lldb/API/SBDeclaration.h"
 #include "lldb/API/SBInstruction.h"
 #include "lldb/API/SBListener.h"
 #include "lldb/API/SBMemoryRegionInfo.h"
+#include "lldb/API/SBStream.h"
 #include "lldb/API/SBStringList.h"
+#include "lldb/Host/Config.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Option/Option.h"
 #include "llvm/Support/Base64.h"
-
+#include "llvm/Support/Errno.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <array>
 #include <cassert>
 #include <climits>
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <map>
+#include <memory>
 #include <optional>
+#include <set>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <thread>
+#include <vector>
+
 #if defined(_WIN32)
 // We need to #define NOMINMAX in order to skip `min()` and `max()` macro
 // definitions that conflict with other system headers.
@@ -46,35 +74,6 @@
 #include <sys/prctl.h>
 #endif
 
-#include <algorithm>
-#include <array>
-#include <map>
-#include <memory>
-#include <set>
-#include <thread>
-#include <vector>
-
-#include "lldb/API/SBStream.h"
-#include "lldb/Host/Config.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Option/Arg.h"
-#include "llvm/Option/ArgList.h"
-#include "llvm/Option/OptTable.h"
-#include "llvm/Option/Option.h"
-#include "llvm/Support/Errno.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include "JSONUtils.h"
-#include "LLDBUtils.h"
-#include "OutputRedirector.h"
-
 #if defined(_WIN32)
 #ifndef PATH_MAX
 #define PATH_MAX MAX_PATH
@@ -526,8 +525,8 @@ void EventThreadFunction() {
         if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) {
           auto event_type =
               lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event);
-          auto bp =
-              Breakpoint(lldb::SBBreakpoint::GetBreakpointFromEvent(event));
+          auto bp = Breakpoint(
+              g_dap, lldb::SBBreakpoint::GetBreakpointFromEvent(event));
           // If the breakpoint was originated from the IDE, it will have the
           // BreakpointBase::GetBreakpointLabel() label attached. Regardless
           // of wether the locations were added or removed, the breakpoint
@@ -2689,10 +2688,10 @@ void request_setBreakpoints(const llvm::json::Object &request) {
   llvm::json::Object response;
   lldb::SBError error;
   FillResponse(request, response);
-  auto arguments = request.getObject("arguments");
-  auto source = arguments->getObject("source");
+  const auto *arguments = request.getObject("arguments");
+  const auto *source = arguments->getObject("source");
   const auto path = GetString(source, "path");
-  auto breakpoints = arguments->getArray("breakpoints");
+  const auto *breakpoints = arguments->getArray("breakpoints");
   llvm::json::Array response_breakpoints;
 
   // Decode the source breakpoint infos for this "setBreakpoints" request
@@ -2701,28 +2700,19 @@ void request_setBreakpoints(const llvm::json::Object &request) {
   // to an empty array.
   if (breakpoints) {
     for (const auto &bp : *breakpoints) {
-      auto bp_obj = bp.getAsObject();
+      const auto *bp_obj = bp.getAsObject();
       if (bp_obj) {
-        SourceBreakpoint src_bp(*bp_obj);
-        request_bps[src_bp.line] = src_bp;
-
+        SourceBreakpoint src_bp(g_dap, *bp_obj);
+        request_bps.try_emplace(src_bp.line, src_bp);
+        const auto [iv, inserted] =
+            g_dap.source_breakpoints[path].try_emplace(src_bp.line, src_bp);
         // We check if this breakpoint already exists to update it
-        auto existing_source_bps = g_dap.source_breakpoints.find(path);
-        if (existing_source_bps != g_dap.source_breakpoints.end()) {
-          const auto &existing_bp =
-              existing_source_bps->second.find(src_bp.line);
-          if (existing_bp != existing_source_bps->second.end()) {
-            existing_bp->second.UpdateBreakpoint(src_bp);
-            AppendBreakpoint(&existing_bp->second, response_breakpoints, path,
-                             src_bp.line);
-            continue;
-          }
-        }
-        // At this point the breakpoint is new
-        g_dap.source_breakpoints[path][src_bp.line] = src_bp;
-        SourceBreakpoint &new_bp = g_dap.source_breakpoints[path][src_bp.line];
-        new_bp.SetBreakpoint(path.data());
-        AppendBreakpoint(&new_bp, response_breakpoints, path, new_bp.line);
+        if (inserted)
+          iv->getSecond().SetBreakpoint(path.data());
+        else
+          iv->getSecond().UpdateBreakpoint(src_bp);
+        AppendBreakpoint(&iv->getSecond(), response_breakpoints, path,
+                         src_bp.line);
       }
     }
   }
@@ -2799,8 +2789,8 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) {
   llvm::json::Object response;
   lldb::SBError error;
   FillResponse(request, response);
-  auto arguments = request.getObject("arguments");
-  auto filters = arguments->getArray("filters");
+  const auto *arguments = request.getObject("arguments");
+  const auto *filters = arguments->getArray("filters");
   // Keep a list of any exception breakpoint filter names that weren't set
   // so we can clear any exception breakpoints if needed.
   std::set<std::string> unset_filters;
@@ -2809,14 +2799,14 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) {
 
   for (const auto &value : *filters) {
     const auto filter = GetAsString(value);
-    auto exc_bp = g_dap.GetExceptionBreakpoint(std::string(filter));
+    auto *exc_bp = g_dap.GetExceptionBreakpoint(std::string(filter));
     if (exc_bp) {
       exc_bp->SetBreakpoint();
       unset_filters.erase(std::string(filter));
     }
   }
   for (const auto &filter : unset_filters) {
-    auto exc_bp = g_dap.GetExceptionBreakpoint(filter);
+    auto *exc_bp = g_dap.GetExceptionBreakpoint(filter);
     if (exc_bp)
       exc_bp->ClearBreakpoint();
   }
@@ -2905,51 +2895,38 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) {
   llvm::json::Object response;
   lldb::SBError error;
   FillResponse(request, response);
-  auto arguments = request.getObject("arguments");
-  auto breakpoints = arguments->getArray("breakpoints");
-  FunctionBreakpointMap request_bps;
+  const auto *arguments = request.getObject("arguments");
+  const auto *breakpoints = arguments->getArray("breakpoints");
   llvm::json::Array response_breakpoints;
-  for (const auto &value : *breakpoints) {
-    auto bp_obj = value.getAsObject();
-    if (bp_obj == nullptr)
-      continue;
-    FunctionBreakpoint func_bp(*bp_obj);
-    request_bps[func_bp.functionName] = std::move(func_bp);
-  }
 
-  std::vector<llvm::StringRef> remove_names;
-  // Disable any function breakpoints that aren't in the request_bps.
+  // Disable any function breakpoints that aren't in this request.
   // There is no call to remove function breakpoints other than calling this
   // function with a smaller or empty "breakpoints" list.
-  for (auto &pair : g_dap.function_breakpoints) {
-    auto request_pos = request_bps.find(pair.first());
-    if (request_pos == request_bps.end()) {
-      // This function breakpoint no longer exists delete it from LLDB
-      g_dap.target.BreakpointDelete(pair.second.bp.GetID());
-      remove_names.push_back(pair.first());
-    } else {
-      // Update the existing breakpoint as any setting withing the function
-      // breakpoint might have changed.
-      pair.second.UpdateBreakpoint(request_pos->second);
-      // Remove this breakpoint from the request breakpoints since we have
-      // handled it here and we don't need to set a new breakpoint below.
-      request_bps.erase(request_pos);
-      // Add this breakpoint info to the response
-      AppendBreakpoint(&pair.second, response_breakpoints);
-    }
+  const auto name_iter = g_dap.function_breakpoints.keys();
+  llvm::DenseSet<llvm::StringRef> seen(name_iter.begin(), name_iter.end());
+  for (const auto &value : *breakpoints) {
+    const auto *bp_obj = value.getAsObject();
+    if (!bp_obj)
+      continue;
+    FunctionBreakpoint fn_bp(g_dap, *bp_obj);
+    const auto [it, inserted] = g_dap.function_breakpoints.try_emplace(
+        fn_bp.functionName, g_dap, *bp_obj);
+    if (inserted)
+      it->second.SetBreakpoint();
+    else
+      it->second.UpdateBreakpoint(fn_bp);
+
+    AppendBreakpoint(&it->second, response_breakpoints);
+    seen.erase(fn_bp.functionName);
   }
+
   // Remove any breakpoints that are no longer in our list
-  for (const auto &name : remove_names)
+  for (const auto &name : seen) {
+    auto fn_bp = g_dap.function_breakpoints.find(name);
+    if (fn_bp == g_dap.function_breakpoints.end())
+      continue;
+    g_dap.target.BreakpointDelete(fn_bp->second.bp.GetID());
     g_dap.function_breakpoints.erase(name);
-
-  // Any breakpoints that are left in "request_bps" are breakpoints that
-  // need to be set.
-  for (auto &pair : request_bps) {
-    // Add this breakpoint info to the response
-    g_dap.function_breakpoints[pair.first()] = std::move(pair.second);
-    FunctionBreakpoint &new_bp = g_dap.function_breakpoints[pair.first()];
-    new_bp.SetBreakpoint();
-    AppendBreakpoint(&new_bp, response_breakpoints);
   }
 
   llvm::json::Object body;
@@ -3201,10 +3178,8 @@ void request_setDataBreakpoints(const llvm::json::Object &request) {
   if (breakpoints) {
     for (const auto &bp : *breakpoints) {
       const auto *bp_obj = bp.getAsObject();
-      if (bp_obj) {
-        Watchpoint wp(*bp_obj);
-        watchpoints.push_back(wp);
-      }
+      if (bp_obj)
+        watchpoints.emplace_back(g_dap, *bp_obj);
     }
   }
   // If two watchpoints start at the same address, the latter overwrite the
@@ -4551,7 +4526,7 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) {
   FillResponse(request, response);
   llvm::json::Array response_breakpoints;
   for (uint32_t i = 0; g_dap.target.GetBreakpointAtIndex(i).IsValid(); ++i) {
-    auto bp = Breakpoint(g_dap.target.GetBreakpointAtIndex(i));
+    auto bp = Breakpoint(g_dap, g_dap.target.GetBreakpointAtIndex(i));
     AppendBreakpoint(&bp, response_breakpoints);
   }
   llvm::json::Object body;
@@ -4560,10 +4535,11 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) {
   g_dap.SendJSON(llvm::json::Value(std::move(response)));
 }
 
-// "SetInstructionBreakpointsRequest" : {
-//   "allOf" : [
-//     {"$ref" : "#/definitions/Request"}, {
-//       "type" : "object",
+// "SetInstructionBreakpointsRequest": {
+//   "allOf": [
+//     {"$ref": "#/definitions/Request"},
+//     {
+//       "type": "object",
 //       "description" :
 //           "Replaces all existing instruction breakpoints. Typically, "
 //           "instruction breakpoints would be set from a disassembly window. "
@@ -4572,235 +4548,218 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) {
 //           "(with reason `instruction breakpoint`) is generated.\nClients "
 //           "should only call this request if the corresponding capability "
 //           "`supportsInstructionBreakpoints` is true.",
-//       "properties" : {
-//         "command" : {"type" : "string", "enum" :
-//         ["setInstructionBreakpoints"]}, "arguments" :
-//             {"$ref" : "#/definitions/SetInstructionBreakpointsArguments"}
+//       "properties": {
+//         "command": { "type": "string", "enum": ["setInstructionBreakpoints"]
+//         }, "arguments": {"$ref":
+//         "#/definitions/SetInstructionBreakpointsArguments"}
 //       },
-//       "required" : [ "command", "arguments" ]
+//       "required": [ "command", "arguments" ]
 //     }
 //   ]
 // },
-//                                      "SetInstructionBreakpointsArguments"
-//     : {
-//       "type" : "object",
-//       "description" : "Arguments for `setInstructionBreakpoints` request",
-//       "properties" : {
-//         "breakpoints" : {
-//           "type" : "array",
-//           "items" : {"$ref" : "#/definitions/InstructionBreakpoint"},
-//           "description" : "The instruction references of the breakpoints"
-//         }
-//       },
-//       "required" : ["breakpoints"]
-//     },
-//       "SetInstructionBreakpointsResponse"
-//     : {
-//       "allOf" : [
-//         {"$ref" : "#/definitions/Response"}, {
-//           "type" : "object",
-//           "description" : "Response to `setInstructionBreakpoints` request",
-//           "properties" : {
-//             "body" : {
-//               "type" : "object",
-//               "properties" : {
-//                 "breakpoints" : {
-//                   "type" : "array",
-//                   "items" : {"$ref" : "#/definitions/Breakpoint"},
-//                   "description" :
-//                       "Information about the breakpoints. The array elements
-//                       " "correspond to the elements of the `breakpoints`
-//                       array."
-//                 }
-//               },
-//               "required" : ["breakpoints"]
+// "SetInstructionBreakpointsArguments": {
+//   "type": "object",
+//   "description": "Arguments for `setInstructionBreakpoints` request",
+//   "properties": {
+//     "breakpoints": {
+//       "type": "array",
+//       "items": {"$ref": "#/definitions/InstructionBreakpoint"},
+//       "description": "The instruction references of the breakpoints"
+//     }
+//   },
+//   "required": ["breakpoints"]
+// },
+// "SetInstructionBreakpointsResponse": {
+//   "allOf": [
+//     {"$ref": "#/definitions/Response"},
+//     {
+//       "type": "object",
+//       "description": "Response to `setInstructionBreakpoints` request",
+//       "properties": {
+//         "body": {
+//           "type": "object",
+//           "properties": {
+//             "breakpoints": {
+//               "type": "array",
+//               "items": {"$ref": "#/definitions/Breakpoint"},
+//               "description":
+//                   "Information about the breakpoints. The array elements
+//                   " "correspond to the elements of the `breakpoints`
+//                   array."
 //             }
 //           },
-//           "required" : ["body"]
+//           "required": ["breakpoints"]
 //         }
-//       ]
-//     },
-// "InstructionBreakpoint" : {
-//   "type" : "object",
-//   "description" : "Properties of a breakpoint passed to the "
+//       },
+//       "required": ["body"]
+//     }
+//   ]
+// },
+// "InstructionBreakpoint": {
+//   "type": "object",
+//   "description": "Properties of a breakpoint passed to the "
 //                   "`setInstructionBreakpoints` request",
-//   "properties" : {
-//     "instructionReference" : {
-//       "type" : "string",
+//   "properties": {
+//     "instructionReference": {
+//       "type": "string",
 //       "description" :
 //           "The instruction reference of the breakpoint.\nThis should be a "
 //           "memory or instruction pointer reference from an
 //           `EvaluateResponse`, "
 //           "`Variable`, `StackFrame`, `GotoTarget`, or `Breakpoint`."
 //     },
-//     "offset" : {
-//       "type" : "integer",
-//       "description" : "The offset from the instruction reference in "
+//     "offset": {
+//       "type": "integer",
+//       "description": "The offset from the instruction reference in "
 //                       "bytes.\nThis can be negative."
 //     },
-//     "condition" : {
-//       "type" : "string",
-//       "description" : "An expression for conditional breakpoints.\nIt is only
+//     "condition": {
+//       "type": "string",
+//       "description": "An expression for conditional breakpoints.\nIt is only
 //       "
 //                       "honored by a debug adapter if the corresponding "
 //                       "capability `supportsConditionalBreakpoints` is true."
 //     },
-//     "hitCondition" : {
-//       "type" : "string",
-//       "description" : "An expression that controls how many hits of the "
+//     "hitCondition": {
+//       "type": "string",
+//       "description": "An expression that controls how many hits of the "
 //                       "breakpoint are ignored.\nThe debug adapter is expected
 //                       " "to interpret the expression as needed.\nThe
 //                       attribute " "is only honored by a debug adapter if the
 //                       corresponding " "capability
 //                       `supportsHitConditionalBreakpoints` is true."
 //     },
-//     "mode" : {
-//       "type" : "string",
-//       "description" : "The mode of this breakpoint. If defined, this must be
+//     "mode": {
+//       "type": "string",
+//       "description": "The mode of this breakpoint. If defined, this must be
 //       "
 //                       "one of the `breakpointModes` the debug adapter "
 //                       "advertised in its `Capabilities`."
 //     }
 //   },
-//   "required" : ["instructionReference"]
+//   "required": ["instructionReference"]
 // },
-// "Breakpoint"
-//     : {
-//       "type" : "object",
+// "Breakpoint": {
+//   "type": "object",
+//   "description" :
+//       "Information about a breakpoint created in `setBreakpoints`, "
+//       "`setFunctionBreakpoints`, `setInstructionBreakpoints`, or "
+//       "`setDataBreakpoints` requests.",
+//   "properties": {
+//     "id": {
+//       "type": "integer",
 //       "description" :
-//           "Information about a breakpoint created in `setBreakpoints`, "
-//           "`setFunctionBreakpoints`, `setInstructionBreakpoints`, or "
-//           "`setDataBreakpoints` requests.",
-//       "properties" : {
-//         "id" : {
-//           "type" : "integer",
-//           "description" :
-//               "The identifier for the breakpoint. It is needed if breakpoint
-//               " "events are used to update or remove breakpoints."
-//         },
-//         "verified" : {
-//           "type" : "boolean",
-//           "description" : "If true, the breakpoint could be set (but not "
-//                           "necessarily at the desired location)."
-//         },
-//         "message" : {
-//           "type" : "string",
-//           "description" : "A message about the state of the breakpoint.\nThis
-//           "
-//                           "is shown to the user and can be used to explain
-//                           why " "a breakpoint could not be verified."
-//         },
-//         "source" : {
-//           "$ref" : "#/definitions/Source",
-//           "description" : "The source where the breakpoint is located."
-//         },
-//         "line" : {
-//           "type" : "integer",
-//           "description" :
-//               "The start line of the actual range covered by the breakpoint."
-//         },
-//         "column" : {
-//           "type" : "integer",
-//           "description" :
-//               "Start position of the source range covered by the breakpoint.
-//               " "It is measured in UTF-16 code units and the client
-//               capability "
-//               "`columnsStartAt1` determines whether it is 0- or 1-based."
-//         },
-//         "endLine" : {
-//           "type" : "integer",
-//           "description" :
-//               "The end line of the actual range covered by the breakpoint."
-//         },
-//         "endColumn" : {
-//           "type" : "integer",
-//           "description" :
-//               "End position of the source range covered by the breakpoint. It
-//               " "is measured in UTF-16 code units and the client capability "
-//               "`columnsStartAt1` determines whether it is 0- or 1-based.\nIf
-//               " "no end line is given, then the end column is assumed to be
-//               in " "the start line."
-//         },
-//         "instructionReference" : {
-//           "type" : "string",
-//           "description" : "A memory reference to where the breakpoint is
-//           set."
-//         },
-//         "offset" : {
-//           "type" : "integer",
-//           "description" : "The offset from the instruction reference.\nThis "
-//                           "can be negative."
-//         },
-//         "reason" : {
-//           "type" : "string",
-//           "description" :
-//               "A machine-readable explanation of why a breakpoint may not be
-//               " "verified. If a breakpoint is verified or a specific reason
-//               is " "not known, the adapter should omit this property.
-//               Possible " "values include:\n\n- `pending`: Indicates a
-//               breakpoint might be " "verified in the future, but the adapter
-//               cannot verify it in the " "current state.\n - `failed`:
-//               Indicates a breakpoint was not " "able to be verified, and the
-//               adapter does not believe it can be " "verified without
-//               intervention.",
-//           "enum" : [ "pending", "failed" ]
-//         }
-//       },
-//       "required" : ["verified"]
+//           "The identifier for the breakpoint. It is needed if breakpoint
+//           " "events are used to update or remove breakpoints."
 //     },
-
+//     "verified": {
+//       "type": "boolean",
+//       "description": "If true, the breakpoint could be set (but not "
+//                       "necessarily at the desired location)."
+//     },
+//     "message": {
+//       "type": "string",
+//       "description": "A message about the state of the breakpoint.\nThis
+//       "
+//                       "is shown to the user and can be used to explain
+//                       why " "a breakpoint could not be verified."
+//     },
+//     "source": {
+//       "$ref": "#/definitions/Source",
+//       "description": "The source where the breakpoint is located."
+//     },
+//     "line": {
+//       "type": "integer",
+//       "description" :
+//           "The start line of the actual range covered by the breakpoint."
+//     },
+//     "column": {
+//       "type": "integer",
+//       "description" :
+//           "Start position of the source range covered by the breakpoint.
+//           " "It is measured in UTF-16 code units and the client
+//           capability "
+//           "`columnsStartAt1` determines whether it is 0- or 1-based."
+//     },
+//     "endLine": {
+//       "type": "integer",
+//       "description" :
+//           "The end line of the actual range covered by the breakpoint."
+//     },
+//     "endColumn": {
+//       "type": "integer",
+//       "description" :
+//           "End position of the source range covered by the breakpoint. It
+//           " "is measured in UTF-16 code units and the client capability "
+//           "`columnsStartAt1` determines whether it is 0- or 1-based.\nIf
+//           " "no end line is given, then the end column is assumed to be
+//           in " "the start line."
+//     },
+//     "instructionReference": {
+//       "type": "string",
+//       "description": "A memory reference to where the breakpoint is
+//       set."
+//     },
+//     "offset": {
+//       "type": "integer",
+//       "description": "The offset from the instruction reference.\nThis "
+//                       "can be negative."
+//     },
+//     "reason": {
+//       "type": "string",
+//       "description" :
+//           "A machine-readable explanation of why a breakpoint may not be
+//           " "verified. If a breakpoint is verified or a specific reason
+//           is " "not known, the adapter should omit this property.
+//           Possible " "values include:\n\n- `pending`: Indicates a
+//           breakpoint might be " "verified in the future, but the adapter
+//           cannot verify it in the " "current state.\n - `failed`:
+//           Indicates a breakpoint was not " "able to be verified, and the
+//           adapter does not believe it can be " "verified without
+//           intervention.",
+//       "enum": [ "pending", "failed" ]
+//     }
+//   },
+//   "required": ["verified"]
+// },
 void request_setInstructionBreakpoints(const llvm::json::Object &request) {
   llvm::json::Object response;
   llvm::json::Array response_breakpoints;
   llvm::json::Object body;
   FillResponse(request, response);
 
-  auto arguments = request.getObject("arguments");
-  auto breakpoints = arguments->getArray("breakpoints");
+  const auto *arguments = request.getObject("arguments");
+  const auto *breakpoints = arguments->getArray("breakpoints");
 
-  // It holds active instruction breakpoint list received from DAP.
-  InstructionBreakpointMap request_ibp;
-  if (breakpoints) {
-    for (const auto &bp : *breakpoints) {
-      auto bp_obj = bp.getAsObject();
-      if (bp_obj) {
-        // Read instruction breakpoint request.
-        InstructionBreakpoint inst_bp(*bp_obj);
-        // Store them into map for reference.
-        request_ibp[inst_bp.instructionAddressReference] = std::move(inst_bp);
-      }
-    }
+  // Disable any instruction breakpoints that aren't in this request.
+  // There is no call to remove instruction breakpoints other than calling this
+  // function with a smaller or empty "breakpoints" list.
+  llvm::DenseSet<lldb::addr_t> seen;
+  for (const auto &addr : g_dap.instruction_breakpoints)
+    seen.insert(addr.first);
 
-    // Iterate previous active instruction breakpoint list.
-    for (auto &prev_ibp : g_dap.instruction_breakpoints) {
-      // Find previous instruction breakpoint reference address in newly
-      // received instruction breakpoint list.
-      auto inst_reference = request_ibp.find(prev_ibp.first);
-      // Request for remove and delete the breakpoint, if the prev instruction
-      // breakpoint ID is not available in active instrcation breakpoint list.
-      // Means delete removed breakpoint instance.
-      if (inst_reference == request_ibp.end()) {
-        g_dap.target.BreakpointDelete(prev_ibp.second.id);
-        // Update Prev instruction breakpoint list.
-        g_dap.instruction_breakpoints.erase(prev_ibp.first);
-      } else {
-        // Instead of recreating breakpoint instance, update the breakpoint if
-        // there are any conditional changes.
-        prev_ibp.second.UpdateBreakpoint(inst_reference->second);
-        request_ibp.erase(inst_reference);
-        response_breakpoints.emplace_back(
-            CreateInstructionBreakpoint(&prev_ibp.second));
-      }
-    }
+  for (const auto &bp : *breakpoints) {
+    const auto *bp_obj = bp.getAsObject();
+    if (!bp_obj)
+      continue;
+    // Read instruction breakpoint request.
+    InstructionBreakpoint inst_bp(g_dap, *bp_obj);
+    const auto [iv, inserted] = g_dap.instruction_breakpoints.try_emplace(
+        inst_bp.instructionAddressReference, g_dap, *bp_obj);
+    if (inserted)
+      iv->second.SetBreakpoint();
+    else
+      iv->second.UpdateBreakpoint(inst_bp);
+    AppendBreakpoint(&iv->second, response_breakpoints);
+    seen.erase(inst_bp.instructionAddressReference);
+  }
 
-    for (auto &req_bpi : request_ibp) {
-      // Add this breakpoint info to the response
-      g_dap.instruction_breakpoints[req_bpi.first] = std::move(req_bpi.second);
-      InstructionBreakpoint &new_bp =
-          g_dap.instruction_breakpoints[req_bpi.first];
-      new_bp.SetInstructionBreakpoint();
-      response_breakpoints.emplace_back(CreateInstructionBreakpoint(&new_bp));
-    }
+  for (const auto &addr : seen) {
+    auto inst_bp = g_dap.instruction_breakpoints.find(addr);
+    if (inst_bp == g_dap.instruction_breakpoints.end())
+      continue;
+    g_dap.target.BreakpointDelete(inst_bp->second.bp.GetID());
+    g_dap.instruction_breakpoints.erase(addr);
   }
 
   body.try_emplace("breakpoints", std::move(response_breakpoints));
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 72573fa..cd2022e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -321,6 +321,9 @@ public:
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
 
+  // Transform a G_SUB with constant on the RHS to G_ADD.
+  bool matchCombineSubToAdd(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   // Transform a G_SHL with an extended source into a narrower shift if
   // possible.
   bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData);
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 8f045d6..dfdfa05 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -582,6 +582,7 @@ struct VerdefEntry {
   std::optional<uint16_t> Flags;
   std::optional<uint16_t> VersionNdx;
   std::optional<uint32_t> Hash;
+  std::optional<uint16_t> VDAux;
   std::vector<StringRef> VerNames;
 };
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 5928b36..6da089b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -335,6 +335,14 @@ def mul_to_shl : GICombineRule<
          [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>;
 
+// (sub x, C) -> (add x, -C)
+def sub_to_add : GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$matchinfo),
+  (match (G_CONSTANT $c, $imm),
+         (G_SUB $d, $op1, $c):$mi,
+         [{ return Helper.matchCombineSubToAdd(*${mi}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFnNoErase(*${mi}, ${matchinfo}); }])>;
+
 // shl ([asz]ext x), y => zext (shl x, y), if shift does not overflow int
 def reduce_shl_of_extend_matchdata : GIDefMatchData<"RegisterImmPair">;
 def reduce_shl_of_extend : GICombineRule<
@@ -1912,8 +1920,9 @@ def bitreverse_shift : GICombineGroup<[bitreverse_shl, bitreverse_lshr]>;
 def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp,
                                       select_to_iminmax, match_selects]>;
 
-def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
-                                       mul_by_neg_one, idempotent_prop]>;
+def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, sub_to_add,
+                                       add_p2i_to_ptradd, mul_by_neg_one,
+                                       idempotent_prop]>;
 
 def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma,
   combine_fadd_fpext_fmul_to_fmad_or_fma, combine_fadd_fma_fmul_to_fmad_or_fma,
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
index 18e34bc..02cd765 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
@@ -25,10 +25,12 @@ namespace llvm::sandboxir {
 class BottomUpVec final : public FunctionPass {
   bool Change = false;
   std::unique_ptr<LegalityAnalysis> Legality;
+  SmallVector<Instruction *> DeadInstrCandidates;
 
   /// Creates and returns a vector instruction that replaces the instructions in
   /// \p Bndl. \p Operands are the already vectorized operands.
   Value *createVectorInstr(ArrayRef<Value *> Bndl, ArrayRef<Value *> Operands);
+  void tryEraseDeadInstrs();
   Value *vectorizeRec(ArrayRef<Value *> Bndl);
   bool tryVectorize(ArrayRef<Value *> Seeds);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3b648a7e..32afbea 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2044,6 +2044,31 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
   Observer.changedInstr(MI);
 }
 
+bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI,
+                                          BuildFnTy &MatchInfo) {
+  GSub &Sub = cast<GSub>(MI);
+
+  LLT Ty = MRI.getType(Sub.getReg(0));
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {Ty}}))
+    return false;
+
+  if (!isConstantLegalOrBeforeLegalizer(Ty))
+    return false;
+
+  APInt Imm = getIConstantFromReg(Sub.getRHSReg(), MRI);
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    auto NegCst = B.buildConstant(Ty, -Imm);
+    Observer.changingInstr(MI);
+    MI.setDesc(B.getTII().get(TargetOpcode::G_ADD));
+    MI.getOperand(2).setReg(NegCst.getReg(0));
+    MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
+    Observer.changedInstr(MI);
+  };
+  return true;
+}
+
 // shl ([sza]ext x), y => zext (shl x, y), if shift does not overflow source
 bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
                                              RegisterImmPair &MatchData) {
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index e2c09fe..10369928 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1487,7 +1487,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
     LLT SrcTy = MRI->getType(MI->getOperand(NumDsts).getReg());
     if (DstTy.isVector()) {
       // This case is the converse of G_CONCAT_VECTORS.
-      if (!SrcTy.isVector() || SrcTy.getScalarType() != DstTy.getScalarType() ||
+      if (!SrcTy.isVector() ||
+          (SrcTy.getScalarType() != DstTy.getScalarType() &&
+           !SrcTy.isPointerVector()) ||
           SrcTy.isScalableVector() != DstTy.isScalableVector() ||
           SrcTy.getSizeInBits() != NumDsts * DstTy.getSizeInBits())
         report("G_UNMERGE_VALUES source operand does not match vector "
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index fc234581..bf19e81 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1655,7 +1655,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
     VerDef.vd_flags = E.Flags.value_or(0);
     VerDef.vd_ndx = E.VersionNdx.value_or(0);
     VerDef.vd_hash = E.Hash.value_or(0);
-    VerDef.vd_aux = sizeof(Elf_Verdef);
+    VerDef.vd_aux = E.VDAux.value_or(sizeof(Elf_Verdef));
     VerDef.vd_cnt = E.VerNames.size();
     if (I == Section.Entries->size() - 1)
       VerDef.vd_next = 0;
@@ -1665,13 +1665,13 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
     CBA.write((const char *)&VerDef, sizeof(Elf_Verdef));
 
     for (size_t J = 0; J < E.VerNames.size(); ++J, ++AuxCnt) {
-      Elf_Verdaux VernAux;
-      VernAux.vda_name = DotDynstr.getOffset(E.VerNames[J]);
+      Elf_Verdaux VerdAux;
+      VerdAux.vda_name = DotDynstr.getOffset(E.VerNames[J]);
       if (J == E.VerNames.size() - 1)
-        VernAux.vda_next = 0;
+        VerdAux.vda_next = 0;
       else
-        VernAux.vda_next = sizeof(Elf_Verdaux);
-      CBA.write((const char *)&VernAux, sizeof(Elf_Verdaux));
+        VerdAux.vda_next = sizeof(Elf_Verdaux);
+      CBA.write((const char *)&VerdAux, sizeof(Elf_Verdaux));
     }
   }
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e97248c..bd816a6 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -1921,6 +1921,7 @@ void MappingTraits<ELFYAML::VerdefEntry>::mapping(IO &IO,
   IO.mapOptional("Flags", E.Flags);
   IO.mapOptional("VersionNdx", E.VersionNdx);
   IO.mapOptional("Hash", E.Hash);
+  IO.mapOptional("VDAux", E.VDAux);
   IO.mapRequired("Names", E.VerNames);
 }
 
diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp
index 096b827..df941b2 100644
--- a/llvm/lib/SandboxIR/Instruction.cpp
+++ b/llvm/lib/SandboxIR/Instruction.cpp
@@ -124,10 +124,6 @@ void Instruction::moveBefore(BasicBlock &BB, const BBIterator &WhereIt) {
 
 void Instruction::insertBefore(Instruction *BeforeI) {
   llvm::Instruction *BeforeTopI = BeforeI->getTopmostLLVMInstruction();
-  // TODO: Move this to the verifier of sandboxir::Instruction.
-  assert(is_sorted(getLLVMInstrs(),
-                   [](auto *I1, auto *I2) { return I1->comesBefore(I2); }) &&
-         "Expected program order!");
 
   Ctx.getTracker().emplaceIfTracking<InsertIntoBB>(this);
 
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 0d69bbe..6854ccc 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -63,7 +63,7 @@ def SVE2p1Unsupported : AArch64Unsupported;
 
 def SVE2Unsupported : AArch64Unsupported {
   let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16,
-                       HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm,
+                       HasSMEF8F32, HasSVE2AES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm,
                        HasSVEB16B16],
                        SVE2p1Unsupported.F);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td
index f6dfbe25..61cb4f7 100644
--- a/llvm/lib/Target/AArch64/AArch64FMV.td
+++ b/llvm/lib/Target/AArch64/AArch64FMV.td
@@ -78,7 +78,7 @@ def : FMVExtension<"sme2", "FEAT_SME2", "+sme2,+sme,+bf16", 580>;
 def : FMVExtension<"ssbs", "FEAT_SSBS2", "+ssbs", 490>;
 def : FMVExtension<"sve", "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
 def : FMVExtension<"sve2", "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>;
-def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", "+sve2,+sve,+aes,+sve-aes,+fullfp16,+fp-armv8,+neon", 380>;
+def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", "+sve2,+sve,+aes,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>;
 def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>;
 def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>;
 def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>;
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index f681f86..6d8554c 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -365,12 +365,9 @@ def FeatureSVE2 : ExtensionWithMArch<"sve2", "SVE2", "FEAT_SVE2",
   "Enable Scalable Vector Extension 2 (SVE2) instructions",
   [FeatureSVE, FeatureUseScalarIncVL]>;
 
-def FeatureSVEAES : ExtensionWithMArch<"sve-aes", "SVEAES",
+def FeatureSVE2AES : ExtensionWithMArch<"sve2-aes", "SVE2AES",
   "FEAT_SVE_AES, FEAT_SVE_PMULL128",
-  "Enable SVE AES and quadword SVE polynomial multiply instructions", [FeatureAES]>;
-
-def AliasSVE2AES : ExtensionWithMArch<"sve2-aes", "ALIAS_SVE2AES", "",
-  "An alias of +sve2+sve-aes", [FeatureSVE2, FeatureSVEAES]>;
+  "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;
 
 def FeatureSVE2SM4 : ExtensionWithMArch<"sve2-sm4", "SVE2SM4", "FEAT_SVE_SM4",
   "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>;
@@ -541,13 +538,13 @@ def FeatureSME2p2: ExtensionWithMArch<"sme2p2", "SME2p2", "FEAT_SME2p2",
   "Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions", [FeatureSME2p1]>;
 
 def FeatureSSVE_AES : ExtensionWithMArch<"ssve-aes", "SSVE_AES", "FEAT_SSVE_AES",
-  "Enable Armv9.6-A SVE AES support in streaming SVE mode", [FeatureSME2, FeatureSVEAES]>;
+  "Enable Armv9.6-A SVE2 AES support in streaming SVE mode", [FeatureSME2, FeatureSVE2AES]>;
 
 def FeatureSVE2p2 : ExtensionWithMArch<"sve2p2", "SVE2p2", "FEAT_SVE2p2",
   "Enable Armv9.6-A Scalable Vector Extension 2.2 instructions", [FeatureSVE2p1]>;
 
 def FeatureSVEAES2: ExtensionWithMArch<"sve-aes2", "SVE_AES2", "FEAT_SVE_AES2",
-  "Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions">;
+  "Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions">;
 
 def FeatureSVEBFSCALE: ExtensionWithMArch<"sve-bfscale", "SVE_BFSCALE", "FEAT_SVE_BFSCALE",
   "Enable Armv9.6-A SVE BFloat16 scaling instructions">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index e44caef..b5f6388 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5821,6 +5821,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
                        (f16 FPR16:$Ra))),
             (!cast<Instruction>(NAME # Hrrr)
               (f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
+
+  def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
+                       (f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
+                       (f16 FPR16:$Ra))),
+            (!cast<Instruction>(NAME # Hrrr)
+              (f16 (EXTRACT_SUBREG V128:$Rn, hsub)),
+              (f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;
   }
 
   def : Pat<(f32 (node (f32 FPR32:$Rn),
@@ -5835,6 +5842,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
             (!cast<Instruction>(NAME # Srrr)
               (EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;
 
+  def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
+                       (f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
+                       (f32 FPR32:$Ra))),
+            (!cast<Instruction>(NAME # Srrr)
+              (EXTRACT_SUBREG V128:$Rn, ssub),
+              (EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;
+
   def : Pat<(f64 (node (f64 FPR64:$Rn),
                        (f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
                        (f64 FPR64:$Ra))),
@@ -5846,6 +5860,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
                        (f64 FPR64:$Ra))),
             (!cast<Instruction>(NAME # Drrr)
               (EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
+
+  def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
+                       (f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
+                       (f64 FPR64:$Ra))),
+            (!cast<Instruction>(NAME # Drrr)
+              (EXTRACT_SUBREG V128:$Rn, dsub),
+              (EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;
 }
 
 //---
@@ -9282,6 +9303,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
                          (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
             (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
                 V128_lo:$Rm, VectorIndexH:$idx)>;
+  def : Pat<(f16 (OpNode (f16 FPR16:$Rd),
+                         (vector_extract (v8f16 V128:$Rn), (i64 0)),
+                         (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
+            (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd,
+                (f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128_lo:$Rm, VectorIndexH:$idx)>;
   } // Predicates = [HasNEON, HasFullFP16]
 
   // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
@@ -9323,12 +9349,22 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
                          (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
             (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
                 V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd),
+                         (vector_extract (v4f32 V128:$Rn), (i64 0)),
+                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd,
+                (f32 (EXTRACT_SUBREG V128:$Rn, ssub)), V128:$Rm, VectorIndexS:$idx)>;
 
   // 1 variant for 64-bit scalar version: extract from .1d or from .2d
   def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
                          (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
             (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
                 V128:$Rm, VectorIndexD:$idx)>;
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd),
+                         (vector_extract (v2f64 V128:$Rn), (i64 0)),
+                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd,
+                (f64 (EXTRACT_SUBREG V128:$Rn, dsub)), V128:$Rm, VectorIndexD:$idx)>;
 }
 
 let mayRaiseFPException = 1, Uses = [FPCR] in
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 57a8c36..a31fd6c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -149,8 +149,8 @@ def HasSVE2          : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
 def HasSVE2p1        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">;
-def HasSVEAES        : Predicate<"Subtarget->hasSVEAES()">,
-                                 AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">;
+def HasSVE2AES       : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">;
 def HasSVE2SM4       : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">;
 def HasSVE2SHA3      : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c244b8e..c10653e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3906,7 +3906,7 @@ let Predicates = [HasSVE2orSME] in {
   defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
 } // End HasSVE2orSME
 
-let Predicates = [HasSVE2, HasSVEAES] in {
+let Predicates = [HasSVE2AES] in {
   // SVE2 crypto destructive binary operations
   defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
   defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 5359e31..712f6de5 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3737,9 +3737,8 @@ static const struct Extension {
     {"rng", {AArch64::FeatureRandGen}},
     {"sve", {AArch64::FeatureSVE}},
     {"sve-b16b16", {AArch64::FeatureSVEB16B16}},
-    {"sve-aes", {AArch64::FeatureSVEAES}},
     {"sve2", {AArch64::FeatureSVE2}},
-    {"sve2-aes", {AArch64::AliasSVE2AES}},
+    {"sve2-aes", {AArch64::FeatureSVE2AES}},
     {"sve2-sm4", {AArch64::FeatureSVE2SM4}},
     {"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
     {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0658e03..755cbb7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3554,7 +3554,7 @@ def : AMDGPUPat <
 >;
 
 def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
-  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
+  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
 }]>;
 
 // x << (bitwidth - y) >> (bitwidth - y)
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index be0a612..cf8e337 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -314,6 +314,19 @@ static DecodeStatus decodeUImmOperand(MCInst &Inst, uint32_t Imm,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus decodeUImmLog2XLenOperand(MCInst &Inst, uint32_t Imm,
+                                              int64_t Address,
+                                              const MCDisassembler *Decoder) {
+  assert(isUInt<6>(Imm) && "Invalid immediate");
+
+  if (!Decoder->getSubtargetInfo().hasFeature(RISCV::Feature64Bit) &&
+      !isUInt<5>(Imm))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
 template <unsigned N>
 static DecodeStatus decodeUImmNonZeroOperand(MCInst &Inst, uint32_t Imm,
                                              int64_t Address,
@@ -323,6 +336,14 @@ static DecodeStatus decodeUImmNonZeroOperand(MCInst &Inst, uint32_t Imm,
   return decodeUImmOperand<N>(Inst, Imm, Address, Decoder);
 }
 
+static DecodeStatus
+decodeUImmLog2XLenNonZeroOperand(MCInst &Inst, uint32_t Imm, int64_t Address,
+                                 const MCDisassembler *Decoder) {
+  if (Imm == 0)
+    return MCDisassembler::Fail;
+  return decodeUImmLog2XLenOperand(Inst, Imm, Address, Decoder);
+}
+
 template <unsigned N>
 static DecodeStatus decodeSImmOperand(MCInst &Inst, uint32_t Imm,
                                       int64_t Address,
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index ba4442f..9135377 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -456,11 +456,17 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   }
 
   if (ST.hasStdExtM()) {
-    getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM})
-        .legalFor({s32, sXLen})
+    getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM})
+        .legalFor({sXLen})
+        .customFor({s32})
         .libcallFor({sDoubleXLen})
         .clampScalar(0, s32, sDoubleXLen)
         .widenScalarToNextPow2(0);
+    getActionDefinitionsBuilder(G_SREM)
+        .legalFor({sXLen})
+        .libcallFor({sDoubleXLen})
+        .clampScalar(0, sXLen, sDoubleXLen)
+        .widenScalarToNextPow2(0);
   } else {
     getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM})
         .libcallFor({sXLen, sDoubleXLen})
@@ -1165,6 +1171,12 @@ static unsigned getRISCVWOpcode(unsigned Opcode) {
   switch (Opcode) {
   default:
     llvm_unreachable("Unexpected opcode");
+  case TargetOpcode::G_SDIV:
+    return RISCV::G_DIVW;
+  case TargetOpcode::G_UDIV:
+    return RISCV::G_DIVUW;
+  case TargetOpcode::G_UREM:
+    return RISCV::G_REMUW;
   case TargetOpcode::G_ROTL:
     return RISCV::G_ROLW;
   case TargetOpcode::G_ROTR:
@@ -1216,6 +1228,9 @@ bool RISCVLegalizerInfo::legalizeCustom(
     return Helper.lower(MI, 0, /* Unused hint type */ LLT()) ==
            LegalizerHelper::Legalized;
   }
+  case TargetOpcode::G_SDIV:
+  case TargetOpcode::G_UDIV:
+  case TargetOpcode::G_UREM:
   case TargetOpcode::G_ROTL:
   case TargetOpcode::G_ROTR: {
     Helper.Observer.changingInstr(MI);
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index 60d9429..69043e9 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -23,6 +23,7 @@ def RISCVO0PreLegalizerCombiner: GICombiner<
 // TODO: Add more combines.
 def RISCVPostLegalizerCombiner
     : GICombiner<"RISCVPostLegalizerCombinerImpl",
-                 [combines_for_extload, redundant_and, identity_combines,
-                  commute_constant_to_rhs, constant_fold_cast_op]> {
+                 [sub_to_add, combines_for_extload, redundant_and,
+                  identity_combines, commute_constant_to_rhs,
+                  constant_fold_cast_op]> {
 }
diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index 10906aeb..c0af1d6 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -106,15 +106,6 @@ def gi_zexti16 : GIComplexOperandMatcher<s32, "selectZExtBits<16>">,
 def gi_zexti8  : GIComplexOperandMatcher<s32, "selectZExtBits<8>">,
                  GIComplexPatternEquiv<zexti8>;
 
-// FIXME: Canonicalize (sub X, C) -> (add X, -C) earlier.
-def : Pat<(XLenVT (sub GPR:$rs1, simm12Plus1:$imm)),
-          (ADDI GPR:$rs1, (NegImm simm12Plus1:$imm))>;
-
-let Predicates = [IsRV64] in {
-def : Pat<(i32 (sub GPR:$rs1, simm12Plus1i32:$imm)),
-          (ADDIW GPR:$rs1, (i64 (NegImm $imm)))>;
-}
-
 // Ptr type used in patterns with GlobalISelEmitter
 def PtrVT : PtrValueTypeByHwMode<XLenVT, 0>;
 
@@ -255,13 +246,6 @@ let Predicates = [HasStdExtZmmul, IsRV64] in {
 def : PatGprGpr<mul, MULW, i32, i32>;
 }
 
-let Predicates = [HasStdExtM, IsRV64] in {
-def : PatGprGpr<sdiv, DIVW, i32, i32>;
-def : PatGprGpr<udiv, DIVUW, i32, i32>;
-def : PatGprGpr<srem, REMW, i32, i32>;
-def : PatGprGpr<urem, REMUW, i32, i32>;
-}
-
 //===----------------------------------------------------------------------===//
 // Zb* RV64 i32 patterns not used by SelectionDAG.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
index 32e6397..bf2f866 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
@@ -17,6 +17,30 @@ class RISCVGenericInstruction : GenericInstruction {
   let Namespace = "RISCV";
 }
 
+// Pseudo equivalent to a RISCVISD::DIVW.
+def G_DIVW : RISCVGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = false;
+}
+def : GINodeEquiv<G_DIVW, riscv_divw>;
+
+// Pseudo equivalent to a RISCVISD::DIVUW.
+def G_DIVUW : RISCVGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = false;
+}
+def : GINodeEquiv<G_DIVUW, riscv_divuw>;
+
+// Pseudo equivalent to a RISCVISD::REMUW.
+def G_REMUW : RISCVGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = false;
+}
+def : GINodeEquiv<G_REMUW, riscv_remuw>;
+
 // Pseudo equivalent to a RISCVISD::RORW.
 def G_RORW : RISCVGenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index a867368..1908f5e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -201,8 +201,7 @@ def uimmlog2xlen : RISCVOp, ImmLeaf<XLenVT, [{
   return isUInt<5>(Imm);
 }]> {
   let ParserMatchClass = UImmLog2XLenAsmOperand;
-  // TODO: should ensure invalid shamt is rejected when decoding.
-  let DecoderMethod = "decodeUImmOperand<6>";
+  let DecoderMethod = "decodeUImmLog2XLenOperand";
   let MCOperandPredicate = [{
     int64_t Imm;
     if (!MCOp.evaluateAsConstantImm(Imm))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 8a76dba..e5a5f60 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -24,8 +24,7 @@ def uimmlog2xlennonzero : RISCVOp, ImmLeaf<XLenVT, [{
   return isUInt<5>(Imm) && (Imm != 0);
 }]> {
   let ParserMatchClass = UImmLog2XLenNonZeroAsmOperand;
-  // TODO: should ensure invalid shamt is rejected when decoding.
-  let DecoderMethod = "decodeUImmNonZeroOperand<6>";
+  let DecoderMethod = "decodeUImmLog2XLenNonZeroOperand";
   let OperandType = "OPERAND_UIMMLOG2XLEN_NONZERO";
   let MCOperandPredicate = [{
     int64_t Imm;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index e583660..d6a59ad 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -20,7 +20,7 @@
 
 using namespace llvm;
 
-static const std::map<std::string, SPIRV::Extension::Extension>
+static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
     SPIRVExtensionMap = {
         {"SPV_EXT_shader_atomic_float_add",
          SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add},
@@ -98,7 +98,7 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName,
       return O.error("Invalid extension list format: " + Token.str());
 
     llvm::StringRef ExtensionName = Token.substr(1);
-    auto NameValuePair = SPIRVExtensionMap.find(ExtensionName.str());
+    auto NameValuePair = SPIRVExtensionMap.find(ExtensionName);
 
     if (NameValuePair == SPIRVExtensionMap.end())
       return O.error("Unknown SPIR-V extension: " + Token.str());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ab879e..19a85a6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46183,11 +46183,17 @@ static SDValue combineToExtendBoolVectorInReg(
     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
     unsigned Scale = NumElts / EltSizeInBits;
     EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
-    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+    bool UseBroadcast = Subtarget.hasInt256() &&
+                        (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
+    Vec = UseBroadcast
+              ? DAG.getSplat(BroadcastVT, DL, N00)
+              : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
     Vec = DAG.getBitcast(VT, Vec);
 
-    for (unsigned i = 0; i != Scale; ++i)
-      ShuffleMask.append(EltSizeInBits, i);
+    for (unsigned i = 0; i != Scale; ++i) {
+      int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
+      ShuffleMask.append(EltSizeInBits, i + Offset);
+    }
     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
@@ -46196,21 +46202,14 @@ static SDValue combineToExtendBoolVectorInReg(
     // widened bits won't be used, and this might allow the use of a broadcast
     // load.
     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
-    unsigned Scale = EltSizeInBits / NumElts;
-    EVT BroadcastVT =
-        EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
-    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
-    ShuffleMask.append(NumElts * Scale, 0);
-    Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
-    Vec = DAG.getBitcast(VT, Vec);
+    EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
+                                       (NumElts * EltSizeInBits) / NumElts);
+    Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
   } else {
     // For smaller scalar integers, we can simply any-extend it to the vector
     // element size (we don't care about the upper bits) and broadcast it to all
     // elements.
-    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
-    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
-    ShuffleMask.append(NumElts, 0);
-    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+    Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
   }
 
   // Now, mask the relevant bit in each element.
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index e742d2e..bc50f23 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -874,6 +874,8 @@ static Value *NegateValue(Value *V, Instruction *BI,
   // negation.
   Instruction *NewNeg =
       CreateNeg(V, V->getName() + ".neg", BI->getIterator(), BI);
+  // NewNeg is generated to potentially replace BI, so use its DebugLoc.
+  NewNeg->setDebugLoc(BI->getDebugLoc());
   ToRedo.insert(NewNeg);
   return NewNeg;
 }
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index ef7ef8e..4a39a48 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -577,8 +577,15 @@ static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache,
     return I;
   }
 
-  assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
-                                   "binary ops which don't apply to pointers");
+  if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+    assert(RMWI->getOperation() == AtomicRMWInst::Xchg &&
+           "Only Xchg is allowed for pointer values");
+    // A RMW Xchg is a combined atomic load and store, so we can treat the
+    // loaded value as a base pointer.
+    Cache[I] = I;
+    setKnownBase(I, /* IsKnownBase */ true, KnownBases);
+    return I;
+  }
 
   // The aggregate ops.  Aggregates can either be in the heap or on the
   // stack, but in either case, this is simply a field load.  As a result,
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 67f9add..56084bb 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -619,7 +619,6 @@ void StructurizeCFG::insertConditions(bool Loops) {
     BasicBlock *SuccFalse = Term->getSuccessor(1);
 
     PhiInserter.Initialize(Boolean, "");
-    PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
     PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
 
     BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index ad5670a..85d4415 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -880,7 +880,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     DeadSucc->removePredecessor(Src, /* KeepOneInputPHIs */ true);
 
     // Replace the conditional branch with an unconditional one.
-    BranchInst::Create(Dest, Term->getIterator());
+    auto *BI = BranchInst::Create(Dest, Term->getIterator());
+    BI->setDebugLoc(Term->getDebugLoc());
     Term->eraseFromParent();
 
     DTUpdates.emplace_back(DominatorTree::Delete, Src, DeadSucc);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c07af85..82d85b1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7521,6 +7521,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   assert((BestFactor.Width == LegacyVF.Width ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
+                                                CostCtx, OrigLoop) ||
+          planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
                                                 CostCtx, OrigLoop)) &&
          " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c5b3537..4a73b9c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -832,6 +832,7 @@ struct InstructionsState {
   InstructionsState() = delete;
   InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+  static InstructionsState invalid() { return {nullptr, nullptr, nullptr}; }
 };
 
 } // end anonymous namespace
@@ -891,20 +892,19 @@ static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        const TargetLibraryInfo &TLI) {
-  constexpr unsigned BaseIndex = 0;
   // Make sure these are all Instructions.
-  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
-    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+  if (!all_of(VL, IsaPred<Instruction>))
+    return InstructionsState::invalid();
 
-  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
-  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
-  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+  Value *V = VL.front();
+  bool IsCastOp = isa<CastInst>(V);
+  bool IsBinOp = isa<BinaryOperator>(V);
+  bool IsCmpOp = isa<CmpInst>(V);
   CmpInst::Predicate BasePred =
-      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
-              : CmpInst::BAD_ICMP_PREDICATE;
-  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+      IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
+  unsigned Opcode = cast<Instruction>(V)->getOpcode();
   unsigned AltOpcode = Opcode;
-  unsigned AltIndex = BaseIndex;
+  unsigned AltIndex = 0;
 
   bool SwappedPredsCompatible = [&]() {
     if (!IsCmpOp)
@@ -931,14 +931,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
   }();
   // Check for one alternate opcode from another BinaryOperator.
   // TODO - generalize to support all operators (types, calls etc.).
-  auto *IBase = cast<Instruction>(VL[BaseIndex]);
+  auto *IBase = cast<Instruction>(V);
   Intrinsic::ID BaseID = 0;
   SmallVector<VFInfo> BaseMappings;
   if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
     BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
     BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
-      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+      return InstructionsState::invalid();
   }
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
@@ -970,7 +970,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         }
       }
     } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
-      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
+      auto *BaseInst = cast<CmpInst>(V);
       Type *Ty0 = BaseInst->getOperand(0)->getType();
       Type *Ty1 = Inst->getOperand(0)->getType();
       if (Ty0 == Ty1) {
@@ -988,7 +988,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
           continue;
         auto *AltInst = cast<CmpInst>(VL[AltIndex]);
-        if (AltIndex != BaseIndex) {
+        if (AltIndex) {
           if (isCmpSameOrSwapped(AltInst, Inst, TLI))
             continue;
         } else if (BasePred != CurrentPred) {
@@ -1007,27 +1007,28 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
         if (Gep->getNumOperands() != 2 ||
             Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
       } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
         if (!isVectorLikeInstWithConstOps(EI))
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
       } else if (auto *LI = dyn_cast<LoadInst>(I)) {
         auto *BaseLI = cast<LoadInst>(IBase);
         if (!LI->isSimple() || !BaseLI->isSimple())
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
       } else if (auto *Call = dyn_cast<CallInst>(I)) {
         auto *CallBase = cast<CallInst>(IBase);
         if (Call->getCalledFunction() != CallBase->getCalledFunction())
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
-        if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
-            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
-                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
-                        CallBase->op_begin() +
-                            CallBase->getBundleOperandsStartIndex())))
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
+        if (Call->hasOperandBundles() &&
+            (!CallBase->hasOperandBundles() ||
+             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
+                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
+                         CallBase->op_begin() +
+                             CallBase->getBundleOperandsStartIndex())))
+          return InstructionsState::invalid();
         Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
         if (ID != BaseID)
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
         if (!ID) {
           SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
           if (Mappings.size() != BaseMappings.size() ||
@@ -1037,15 +1038,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
               Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
               Mappings.front().Shape.Parameters !=
                   BaseMappings.front().Shape.Parameters)
-            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+            return InstructionsState::invalid();
         }
       }
       continue;
     }
-    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+    return InstructionsState::invalid();
   }
 
-  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+  return InstructionsState(V, cast<Instruction>(V),
                            cast<Instruction>(VL[AltIndex]));
 }
 
@@ -2241,10 +2242,9 @@ public:
           HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
         } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                    NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
-          auto *It = HashMap.find(NumFreeOpsHash.Hash);
-          if (It == HashMap.end())
-            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
-          else
+          auto [It, Inserted] =
+              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
+          if (!Inserted)
             ++It->second.first;
         }
       }
@@ -8020,7 +8020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Don't handle vectors.
-  if (!SLPReVec && getValueType(S.OpValue)->isVectorTy()) {
+  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
     return;
@@ -8089,7 +8089,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
   bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
   bool AreScatterAllGEPSameBlock =
-      (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
+      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
        VL.size() > 2 &&
        all_of(VL,
               [&BB](Value *V) {
@@ -8105,7 +8105,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                        SortedIndices));
   bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
   if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
-      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
+      (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
            S.OpValue) &&
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
@@ -8162,7 +8162,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Special processing for sorted pointers for ScatterVectorize node with
   // constant indeces only.
   if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
-    assert(S.OpValue->getType()->isPointerTy() &&
+    assert(VL.front()->getType()->isPointerTy() &&
            count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
            "Expected pointers only.");
     // Reset S to make it GetElementPtr kind of node.
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index 0a930d3..3617d36 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -153,6 +153,17 @@ Value *BottomUpVec::createVectorInstr(ArrayRef<Value *> Bndl,
   // TODO: Propagate debug info.
 }
 
+void BottomUpVec::tryEraseDeadInstrs() {
+  // Visiting the dead instructions bottom-to-top.
+  sort(DeadInstrCandidates,
+       [](Instruction *I1, Instruction *I2) { return I1->comesBefore(I2); });
+  for (Instruction *I : reverse(DeadInstrCandidates)) {
+    if (I->hasNUses(0))
+      I->eraseFromParent();
+  }
+  DeadInstrCandidates.clear();
+}
+
 Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
   Value *NewVec = nullptr;
   const auto &LegalityRes = Legality->canVectorize(Bndl);
@@ -182,7 +193,11 @@ Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
     }
     NewVec = createVectorInstr(Bndl, VecOperands);
 
-    // TODO: Collect potentially dead instructions.
+    // Collect the original scalar instructions as they may be dead.
+    if (NewVec != nullptr) {
+      for (Value *V : Bndl)
+        DeadInstrCandidates.push_back(cast<Instruction>(V));
+    }
     break;
   }
   case LegalityResultID::Pack: {
@@ -194,7 +209,9 @@ Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
 }
 
 bool BottomUpVec::tryVectorize(ArrayRef<Value *> Bndl) {
+  DeadInstrCandidates.clear();
   vectorizeRec(Bndl);
+  tryEraseDeadInstrs();
   return Change;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 00ba2f4..08db0d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -552,18 +552,10 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
          "can only split at a position in the same block");
 
   SmallVector<VPBlockBase *, 2> Succs(successors());
-  // First, disconnect the current block from its successors.
-  for (VPBlockBase *Succ : Succs)
-    VPBlockUtils::disconnectBlocks(this, Succ);
-
   // Create new empty block after the block to split.
   auto *SplitBlock = new VPBasicBlock(getName() + ".split");
   VPBlockUtils::insertBlockAfter(SplitBlock, this);
 
-  // Add successors for block to split to new block.
-  for (VPBlockBase *Succ : Succs)
-    VPBlockUtils::connectBlocks(SplitBlock, Succ);
-
   // Finally, move the recipes starting at SplitAt to new block.
   for (VPRecipeBase &ToMove :
        make_early_inc_range(make_range(SplitAt, this->end())))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6254ea1..ef2ca9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1484,6 +1484,8 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
   State.set(this, Cast);
   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
+  if (auto *CastOp = dyn_cast<Instruction>(Cast))
+    setFlags(CastOp);
 }
 
 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
index 2f10a49..5cbff0f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
@@ -308,8 +308,8 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %a:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 71
-    ; CHECK-NEXT: %sub:_(s64) = G_SUB %a, [[C]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -71
+    ; CHECK-NEXT: %sub:_(s64) = G_ADD %a, [[C]]
     ; CHECK-NEXT: $x0 = COPY %sub(s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %a:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
index f207e9c..e9d4af7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
@@ -88,8 +88,8 @@ body:             |
     ; CHECK-LABEL: name: test_combine_trunc_sub_i128
     ; CHECK: %lhs:_(s128) = COPY $q0
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
-    ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5
+    ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]]
     ; CHECK-NEXT: $w0 = COPY %small(s32)
     %lhs:_(s128) = COPY $q0
     %rhs:_(s128) = G_CONSTANT i128 5
@@ -103,8 +103,8 @@ body:             |
   bb.1:
     ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use
     ; CHECK: %lhs:_(s128) = COPY $q0
-    ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5
-    ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -5
+    ; CHECK-NEXT: %res:_(s128) = G_ADD %lhs, [[C]]
     ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128)
     ; CHECK-NEXT: $q0 = COPY %res(s128)
     ; CHECK-NEXT: $w0 = COPY %small(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
index 04968dab..591b6a1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
@@ -95,7 +95,7 @@ body: |
     %11:_(s8) = G_CONSTANT i8 1
     ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32)
     %7:_(s8) = G_SUB %2, %11
-    ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}}
+    ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}}
     G_BR %bb.3.exit
   bb.3.exit:
   ; CHECK: bb.3.exit:
@@ -197,7 +197,7 @@ body: |
     %7:_(s8) = G_CONSTANT i8 1
     ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32)
     %8:_(s8) = G_SUB %2, %7
-    ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}}
+    ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}}
     G_BR %bb.3.exit
   bb.3.exit:
   ; CHECK: bb.3.exit:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
index 0900dd4..4c3faa9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
@@ -289,8 +289,8 @@ body:             |
     ; CHECK: liveins: $w0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %x:_(s32) = COPY $w0
-    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: %op:_(s32) = G_SUB %x, %cst
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]]
     ; CHECK-NEXT: $w0 = COPY %op(s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     %x:_(s32) = COPY $w0
@@ -488,3 +488,66 @@ body:             |
     RET_ReallyLR implicit $w0
 
 ...
+---
+name:            sub_to_add
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: sub_to_add
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %x:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]]
+    ; CHECK-NEXT: $w0 = COPY %op(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %x:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %op:_(s32) = G_SUB %x(s32), %cst
+    $w0 = COPY %op(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            sub_to_add_nuw
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: sub_to_add_nuw
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %x:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]]
+    ; CHECK-NEXT: $w0 = COPY %op(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %x:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %op:_(s32) = nuw G_SUB %x(s32), %cst
+    $w0 = COPY %op(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            sub_to_add_nsw
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: sub_to_add_nsw
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %x:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: %op:_(s32) = nsw G_ADD %x, [[C]]
+    ; CHECK-NEXT: $w0 = COPY %op(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %x:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %op:_(s32) = nsw G_SUB %x(s32), %cst
+    $w0 = COPY %op(s32)
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index a916fb2..29763f2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -145,3 +145,24 @@ entry:
   %0 = getelementptr inbounds [8 x i32], ptr @arr, i64 0, <2 x i64> %offs
   ret <2 x ptr> %0
 }
+
+define <4 x ptr> @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off) {
+  ; CHECK-LABEL: name: vector_gep_v4i32
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   liveins: $q0, $q1, $q2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x p0>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(<4 x s64>) = G_SEXT [[COPY2]](<4 x s32>)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<4 x p0>) = G_PTR_ADD [[CONCAT_VECTORS]], [[SEXT]](<4 x s64>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x p0>) = COPY [[PTR_ADD]](<4 x p0>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[COPY3]](<4 x p0>)
+  ; CHECK-NEXT:   $q0 = COPY [[UV]](<2 x s64>)
+  ; CHECK-NEXT:   $q1 = COPY [[UV1]](<2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $q0, implicit $q1
+entry:
+  %g = getelementptr i8, <4 x ptr> %b, <4 x i32> %off
+  ret <4 x ptr> %g
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
index fbe913e..afcdb76 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -11,10 +11,10 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NEXT:    mov h2, v0.h[1]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    fmul h3, h0, v1.h[1]
-; CHECK-NEXT:    fmul h4, h2, v1.h[1]
-; CHECK-NEXT:    fmadd h2, h1, h2, h3
-; CHECK-NEXT:    fnmsub h0, h1, h0, h4
-; CHECK-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-NEXT:    fmul h2, h2, v1.h[1]
+; CHECK-NEXT:    fmla h3, h1, v0.h[1]
+; CHECK-NEXT:    fnmsub h0, h1, h0, h2
+; CHECK-NEXT:    mov v0.h[1], v3.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index 725c44c..368683e2 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -120,8 +120,7 @@ define half @t_vfmah_lane_f16_3_0(half %a, <4 x half> %c) {
 ; CHECK-LABEL: t_vfmah_lane_f16_3_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov h2, v1.h[3]
-; CHECK-NEXT:    fmadd h0, h1, h2, h0
+; CHECK-NEXT:    fmla h0, h1, v1.h[3]
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <4 x half> %c, i32 0
@@ -310,8 +309,7 @@ define half @t_vfmsh_lane_f16_0_3(half %a, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_lane_f16_0_3:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov h2, v1.h[3]
-; CHECK-NEXT:    fmsub h0, h2, h1, h0
+; CHECK-NEXT:    fmls h0, h1, v1.h[3]
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <4 x half> %c, i32 0
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index b2ea6ff..544d768 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -84,8 +84,7 @@ define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
 define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
 ; CHECK-LABEL: test_fmla_ss4S_3_ext0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov s2, v1.s[3]
-; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    fmla s0, s1, v1.s[3]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <4 x float> %v, i32 0
   %tmp1 = extractelement <4 x float> %v, i32 3
@@ -96,8 +95,7 @@ define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
 define float @test_fmla_ss4S_3_ext0_swp(float %a, <4 x float> %v) {
 ; CHECK-LABEL: test_fmla_ss4S_3_ext0_swp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov s2, v1.s[3]
-; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    fmla s0, s1, v1.s[3]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <4 x float> %v, i32 0
   %tmp1 = extractelement <4 x float> %v, i32 3
@@ -120,8 +118,7 @@ define float @test_fmla_ss2S_3_ext0(float %a, <2 x float> %v) {
 ; CHECK-LABEL: test_fmla_ss2S_3_ext0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    fmla s0, s1, v1.s[1]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <2 x float> %v, i32 0
   %tmp1 = extractelement <2 x float> %v, i32 1
@@ -133,8 +130,7 @@ define float @test_fmla_ss2S_3_ext0_swp(float %a, <2 x float> %v) {
 ; CHECK-LABEL: test_fmla_ss2S_3_ext0_swp:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    fmla s0, s1, v1.s[1]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <2 x float> %v, i32 0
   %tmp1 = extractelement <2 x float> %v, i32 1
@@ -218,8 +214,7 @@ define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
 define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
 ; CHECK-LABEL: test_fmla_ss2D_1_ext0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d2, v1.d[1]
-; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    fmla d0, d1, v1.d[1]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <2 x double> %v, i32 0
   %tmp1 = extractelement <2 x double> %v, i32 1
@@ -230,8 +225,7 @@ define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
 define double @test_fmla_ss2D_1_ext0_swp(double %a, <2 x double> %v) {
 ; CHECK-LABEL: test_fmla_ss2D_1_ext0_swp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d2, v1.d[1]
-; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    fmla d0, d1, v1.d[1]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <2 x double> %v, i32 0
   %tmp1 = extractelement <2 x double> %v, i32 1
@@ -340,8 +334,7 @@ define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
 define float @test_fmls_ss4S_3_ext0(float %a, <4 x float> %v) {
 ; CHECK-LABEL: test_fmls_ss4S_3_ext0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov s2, v1.s[3]
-; CHECK-NEXT:    fmsub s0, s1, s2, s0
+; CHECK-NEXT:    fmls s0, s1, v1.s[3]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <4 x float> %v, i32 0
   %tmp1 = extractelement <4 x float> %v, i32 3
@@ -437,8 +430,7 @@ define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
 define double @test_fmls_dd2D_1_ext0(double %a, <2 x double> %v) {
 ; CHECK-LABEL: test_fmls_dd2D_1_ext0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d2, v1.d[1]
-; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    fmls d0, d1, v1.d[1]
 ; CHECK-NEXT:    ret
   %tmp0 = extractelement <2 x double> %v, i32 0
   %tmp1 = extractelement <2 x double> %v, i32 1
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 3263a5e..a11e386 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -1,40 +1,58 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=aarch64-none-eabi -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -mtriple=aarch64 -verify-machineinstrs -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -verify-machineinstrs -global-isel=1 -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; Note: these tests use stores instead of returns as the return handling for
-; vector ptrs is currently sometimes create invalid unmerge values.
+; CHECK-GI:       warning: Instruction selection used fallback path for vector_gep_v3i32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for vector_gep_v3i64
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for vector_gep_v3i64_base
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for vector_gep_v3i64_c10
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for vector_gep_v3i64_cm10
 
-define void @vector_gep_i32(ptr %b, i32 %off, ptr %p) {
-; CHECK-LABEL: vector_gep_i32:
+define ptr @scalar_gep_i32(ptr %b, i32 %off) {
+; CHECK-LABEL: scalar_gep_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add x8, x0, w1, sxtw
-; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    add x0, x0, w1, sxtw
 ; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, ptr %b, i32 %off
-  store ptr %g, ptr %p
-  ret void
+  ret ptr %g
 }
 
-define void @vector_gep_i64(ptr %b, i64 %off, ptr %p) {
-; CHECK-LABEL: vector_gep_i64:
+define ptr @scalar_gep_i64(ptr %b, i64 %off) {
+; CHECK-LABEL: scalar_gep_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add x8, x0, x1
-; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    add x0, x0, x1
 ; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, ptr %b, i64 %off
-  store ptr %g, ptr %p
-  ret void
+  ret ptr %g
 }
 
-define void @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off, ptr %p) {
+define ptr @scalar_gep_c10(ptr %b) {
+; CHECK-LABEL: scalar_gep_c10:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add x0, x0, #10
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, i64 10
+  ret ptr %g
+}
+
+define ptr @scalar_gep_cm10(ptr %b) {
+; CHECK-LABEL: scalar_gep_cm10:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub x0, x0, #10
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, i64 -10
+  ret ptr %g
+}
+
+define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) {
 ; CHECK-SD-LABEL: vector_gep_v1i32:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    shl d1, d1, #32
 ; CHECK-SD-NEXT:    ssra d0, d1, #32
-; CHECK-SD-NEXT:    str d0, [x0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_gep_v1i32:
@@ -42,87 +60,64 @@ define void @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off, ptr %p) {
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    fmov x9, d0
 ; CHECK-GI-NEXT:    add x8, x9, w8, sxtw
-; CHECK-GI-NEXT:    str x8, [x0]
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <1 x ptr> %b, <1 x i32> %off
-  store <1 x ptr> %g, ptr %p
-  ret void
+  ret <1 x ptr> %g
 }
 
-define void @vector_gep_v2i32(<2 x ptr> %b, <2 x i32> %off, ptr %p) {
+define <2 x ptr> @vector_gep_v2i32(<2 x ptr> %b, <2 x i32> %off) {
 ; CHECK-LABEL: vector_gep_v2i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    saddw v0.2d, v0.2d, v1.2s
-; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, <2 x ptr> %b, <2 x i32> %off
-  store <2 x ptr> %g, ptr %p
-  ret void
+  ret <2 x ptr> %g
 }
 
-define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) {
-; CHECK-SD-LABEL: vector_gep_v3i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    saddw2 v2.2d, v2.2d, v3.4s
-; CHECK-SD-NEXT:    str d2, [x0, #16]
-; CHECK-SD-NEXT:    saddw v0.2d, v0.2d, v3.2s
-; CHECK-SD-NEXT:    str q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: vector_gep_v3i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    smov x9, v3.s[0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    smov x10, v3.s[1]
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    fmov x8, d1
-; CHECK-GI-NEXT:    mov v4.d[0], x9
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov v0.d[1], x8
-; CHECK-GI-NEXT:    mov w8, v3.s[2]
-; CHECK-GI-NEXT:    mov v4.d[1], x10
-; CHECK-GI-NEXT:    add x8, x9, w8, sxtw
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-GI-NEXT:    str x8, [x0, #16]
-; CHECK-GI-NEXT:    str q0, [x0]
-; CHECK-GI-NEXT:    ret
+define <3 x ptr> @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off) {
+; CHECK-LABEL: vector_gep_v3i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    ext v4.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    saddw v2.2d, v2.2d, v4.2s
+; CHECK-NEXT:    saddw v0.2d, v0.2d, v3.2s
+; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, <3 x ptr> %b, <3 x i32> %off
-  store <3 x ptr> %g, ptr %p
-  ret void
+  ret <3 x ptr> %g
 }
 
-define void @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off, ptr %p) {
+define <4 x ptr> @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off) {
 ; CHECK-SD-LABEL: vector_gep_v4i32:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
 ; CHECK-SD-NEXT:    saddw v0.2d, v0.2d, v2.2s
-; CHECK-SD-NEXT:    stp q0, q1, [x0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_gep_v4i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    saddw v0.2d, v0.2d, v2.2s
 ; CHECK-GI-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-GI-NEXT:    stp q0, q1, [x0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <4 x ptr> %b, <4 x i32> %off
-  store <4 x ptr> %g, ptr %p
-  ret void
+  ret <4 x ptr> %g
 }
 
-define void @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off, ptr %p) {
+define <1 x ptr> @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off) {
 ; CHECK-SD-LABEL: vector_gep_v1i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    add d0, d0, d1
-; CHECK-SD-NEXT:    str d0, [x0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_gep_v1i64:
@@ -130,90 +125,58 @@ define void @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off, ptr %p) {
 ; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    fmov x9, d1
 ; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    str x8, [x0]
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <1 x ptr> %b, <1 x i64> %off
-  store <1 x ptr> %g, ptr %p
-  ret void
+  ret <1 x ptr> %g
 }
 
-define void @vector_gep_v2i64(<2 x ptr> %b, <2 x i64> %off, ptr %p) {
+define <2 x ptr> @vector_gep_v2i64(<2 x ptr> %b, <2 x i64> %off) {
 ; CHECK-LABEL: vector_gep_v2i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, <2 x ptr> %b, <2 x i64> %off
-  store <2 x ptr> %g, ptr %p
-  ret void
+  ret <2 x ptr> %g
 }
 
-define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) {
-; CHECK-SD-LABEL: vector_gep_v3i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    add d1, d2, d5
-; CHECK-SD-NEXT:    str d1, [x0, #16]
-; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-SD-NEXT:    str q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: vector_gep_v3i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    fmov x8, d1
-; CHECK-GI-NEXT:    mov v0.d[1], x8
-; CHECK-GI-NEXT:    fmov x8, d2
-; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-GI-NEXT:    str x8, [x0, #16]
-; CHECK-GI-NEXT:    str q0, [x0]
-; CHECK-GI-NEXT:    ret
+define <3 x ptr> @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off) {
+; CHECK-LABEL: vector_gep_v3i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add d0, d0, d3
+; CHECK-NEXT:    add d1, d1, d4
+; CHECK-NEXT:    add d2, d2, d5
+; CHECK-NEXT:    ret
 entry:
   %g = getelementptr i8, <3 x ptr> %b, <3 x i64> %off
-  store <3 x ptr> %g, ptr %p
-  ret void
+  ret <3 x ptr> %g
 }
 
-define void @vector_gep_v4i64(<4 x ptr> %b, <4 x i64> %off, ptr %p) {
+define <4 x ptr> @vector_gep_v4i64(<4 x ptr> %b, <4 x i64> %off) {
 ; CHECK-SD-LABEL: vector_gep_v4i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v3.2d
 ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-NEXT:    stp q0, q1, [x0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_gep_v4i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
 ; CHECK-GI-NEXT:    add v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    stp q0, q1, [x0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <4 x ptr> %b, <4 x i64> %off
-  store <4 x ptr> %g, ptr %p
-  ret void
+  ret <4 x ptr> %g
 }
 
-define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) {
+define <2 x ptr> @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off) {
 ; CHECK-SD-LABEL: vector_gep_v4i128:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fmov d1, x0
 ; CHECK-SD-NEXT:    mov v1.d[1], x2
 ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT:    str q0, [x4]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_gep_v4i128:
@@ -221,10 +184,231 @@ define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) {
 ; CHECK-GI-NEXT:    mov v1.d[0], x0
 ; CHECK-GI-NEXT:    mov v1.d[1], x2
 ; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-GI-NEXT:    str q0, [x4]
 ; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off
-  store <2 x ptr> %g, ptr %p
-  ret void
+  ret <2 x ptr> %g
+}
+
+
+define <1 x ptr> @vector_gep_v1i64_base(ptr %b, <1 x i64> %off) {
+; CHECK-SD-LABEL: vector_gep_v1i64_base:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    add d0, d1, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v1i64_base:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x8, x0, x8
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <1 x i64> %off
+  ret <1 x ptr> %g
+}
+
+define <2 x ptr> @vector_gep_v2i64_base(ptr %b, <2 x i64> %off) {
+; CHECK-LABEL: vector_gep_v2i64_base:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <2 x i64> %off
+  ret <2 x ptr> %g
+}
+
+define <3 x ptr> @vector_gep_v3i64_base(ptr %b, <3 x i64> %off) {
+; CHECK-LABEL: vector_gep_v3i64_base:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmov d3, x0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    add d2, d3, d2
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <3 x i64> %off
+  ret <3 x ptr> %g
+}
+
+define <4 x ptr> @vector_gep_v4i64_base(ptr %b, <4 x i64> %off) {
+; CHECK-LABEL: vector_gep_v4i64_base:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <4 x i64> %off
+  ret <4 x ptr> %g
+}
+
+define <1 x ptr> @vector_gep_v1i64_c10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v1i64_c10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #10 // =0xa
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    fmov d1, x8
+; CHECK-SD-NEXT:    add d0, d0, d1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v1i64_c10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    add x8, x0, #10
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <1 x i64> <i64 10>
+  ret <1 x ptr> %g
+}
+
+define <2 x ptr> @vector_gep_v2i64_c10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v2i64_c10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #10 // =0xa
+; CHECK-SD-NEXT:    dup v0.2d, x0
+; CHECK-SD-NEXT:    dup v1.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v2i64_c10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI18_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <2 x i64> <i64 10, i64 10>
+  ret <2 x ptr> %g
+}
+
+define <3 x ptr> @vector_gep_v3i64_c10(ptr %b) {
+; CHECK-LABEL: vector_gep_v3i64_c10:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    dup v0.2d, x0
+; CHECK-NEXT:    fmov d3, x0
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add d2, d3, d2
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <3 x i64> <i64 10, i64 10, i64 10>
+  ret <3 x ptr> %g
+}
+
+define <4 x ptr> @vector_gep_v4i64_c10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v4i64_c10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #10 // =0xa
+; CHECK-SD-NEXT:    dup v0.2d, x0
+; CHECK-SD-NEXT:    dup v1.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v4i64_c10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI20_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI20_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <4 x i64> <i64 10, i64 10, i64 10, i64 10>
+  ret <4 x ptr> %g
+}
+
+define <1 x ptr> @vector_gep_v1i64_cm10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v1i64_cm10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    add d0, d1, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v1i64_cm10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub x8, x0, #10
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <1 x i64> <i64 -10>
+  ret <1 x ptr> %g
+}
+
+define <2 x ptr> @vector_gep_v2i64_cm10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v2i64_cm10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT:    dup v1.2d, x0
+; CHECK-SD-NEXT:    dup v0.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v2i64_cm10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <2 x i64> <i64 -10, i64 -10>
+  ret <2 x ptr> %g
+}
+
+define <3 x ptr> @vector_gep_v3i64_cm10(ptr %b) {
+; CHECK-LABEL: vector_gep_v3i64_cm10:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-NEXT:    dup v0.2d, x0
+; CHECK-NEXT:    fmov d3, x0
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add d2, d3, d2
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <3 x i64> <i64 -10, i64 -10, i64 -10>
+  ret <3 x ptr> %g
+}
+
+define <4 x ptr> @vector_gep_v4i64_cm10(ptr %b) {
+; CHECK-SD-LABEL: vector_gep_v4i64_cm10:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT:    dup v1.2d, x0
+; CHECK-SD-NEXT:    dup v0.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v4i64_cm10:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI24_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI24_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %g = getelementptr i8, ptr %b, <4 x i64> <i64 -10, i64 -10, i64 -10, i64 -10>
+  ret <4 x ptr> %g
 }
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
index fe8271c..3bfaf6d 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2-aes,+sve2-sha3,+sve2-sm4 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2,+sve-aes,+sve2-sha3,+sve2-sm4 < %s | FileCheck %s
 
 ;
 ; AESD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll
new file mode 100644
index 0000000..74422a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=WAVE32 %s
+
+define i32 @s_andn2_i1_vcc(i32 %arg0, i32 %arg1) {
+; WAVE64-LABEL: s_andn2_i1_vcc:
+; WAVE64:       ; %bb.0:
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
+; WAVE64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: s_andn2_i1_vcc:
+; WAVE32:       ; %bb.0:
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s4, 0, v1
+; WAVE32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = icmp eq i32 %arg0, 0
+  %src1 = icmp eq i32 %arg1, 0
+  %not.src1 = xor i1 %src1, true
+  %and = and i1 %src0, %not.src1
+  %select = select i1 %and, i32 1, i32 0
+  ret i32 %select
+}
+
+define i32 @s_andn2_i1_vcc_commute(i32 %arg0, i32 %arg1) {
+; WAVE64-LABEL: s_andn2_i1_vcc_commute:
+; WAVE64:       ; %bb.0:
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
+; WAVE64-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: s_andn2_i1_vcc_commute:
+; WAVE32:       ; %bb.0:
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s4, 0, v1
+; WAVE32-NEXT:    s_and_b32 s4, s4, vcc_lo
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = icmp eq i32 %arg0, 0
+  %src1 = icmp eq i32 %arg1, 0
+  %not.src1 = xor i1 %src1, true
+  %and = and i1 %not.src1, %src0
+  %select = select i1 %and, i32 1, i32 0
+  ret i32 %select
+}
+
+define i32 @s_andn2_i1_vcc_multi_use(i32 %arg0, i32 %arg1) {
+; WAVE64-LABEL: s_andn2_i1_vcc_multi_use:
+; WAVE64:       ; %bb.0:
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
+; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, v0, 1, s[4:5]
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: s_andn2_i1_vcc_multi_use:
+; WAVE32:       ; %bb.0:
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s4, 0, v1
+; WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; WAVE32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, v0, 1, s4
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = icmp eq i32 %arg0, 0
+  %src1 = icmp eq i32 %arg1, 0
+  %not.src1 = xor i1 %src1, -1
+  %user = zext i1 %not.src1 to i32
+  %and = and i1 %src0, %not.src1
+  %select = select i1 %and, i32 1, i32 %user
+  ret i32 %select
+}
+
+define <2 x i32> @s_andn2_v2i1_vcc(<2 x i32> %arg0, <2 x i32> %arg1) {
+; WAVE64-LABEL: s_andn2_v2i1_vcc:
+; WAVE64:       ; %bb.0:
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v2
+; WAVE64-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v1
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v3
+; WAVE64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_and_b64 s[4:5], s[6:7], s[8:9]
+; WAVE64-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: s_andn2_v2i1_vcc:
+; WAVE32:       ; %bb.0:
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s4, 0, v2
+; WAVE32-NEXT:    v_cmp_eq_u32_e64 s5, 0, v1
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s6, 0, v3
+; WAVE32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; WAVE32-NEXT:    s_and_b32 s4, s5, s6
+; WAVE32-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = icmp eq <2 x i32> %arg0, zeroinitializer
+  %src1 = icmp eq <2 x i32> %arg1, zeroinitializer
+  %not.src1 = xor <2 x i1> %src1, <i1 true, i1 true>
+  %and = and <2 x i1> %src0, %not.src1
+  %select = select <2 x i1> %and, <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+  ret <2 x i32> %select
+}
+
+define <2 x i32> @s_andn2_v2i1_vcc_commute(<2 x i32> %arg0, <2 x i32> %arg1) {
+; WAVE64-LABEL: s_andn2_v2i1_vcc_commute:
+; WAVE64:       ; %bb.0:
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v2
+; WAVE64-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v1
+; WAVE64-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v3
+; WAVE64-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; WAVE64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_and_b64 s[4:5], s[8:9], s[6:7]
+; WAVE64-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: s_andn2_v2i1_vcc_commute:
+; WAVE32:       ; %bb.0:
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s4, 0, v2
+; WAVE32-NEXT:    v_cmp_eq_u32_e64 s5, 0, v1
+; WAVE32-NEXT:    v_cmp_ne_u32_e64 s6, 0, v3
+; WAVE32-NEXT:    s_and_b32 s4, s4, vcc_lo
+; WAVE32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; WAVE32-NEXT:    s_and_b32 s4, s6, s5
+; WAVE32-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = icmp eq <2 x i32> %arg0, zeroinitializer
+  %src1 = icmp eq <2 x i32> %arg1, zeroinitializer
+  %not.src1 = xor <2 x i1> %src1, <i1 true, i1 true>
+  %and = and <2 x i1> %not.src1, %src0
+  %select = select <2 x i1> %and, <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+  ret <2 x i32> %select
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464..493e8ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1669,7 +1669,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v3
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[0:1], v3
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v8
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v3
 ; GFX6-NEXT:    v_ashr_i64 v[10:11], v[4:5], v3
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
@@ -1692,7 +1692,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
 ; GFX8-NEXT:    v_ashrrev_i64 v[10:11], v3, v[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
@@ -1715,7 +1715,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffc0, v3
 ; GFX9-NEXT:    v_ashrrev_i64 v[10:11], v3, v[4:5]
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
@@ -1735,7 +1735,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
@@ -1758,7 +1758,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 405b1e8..46d6b86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1438,7 +1438,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; SI-NEXT:    v_ffbh_i32_e32 v3, 0
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    v_min_u32_e32 v2, v3, v2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
@@ -1456,7 +1456,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; VI-NEXT:    v_ffbh_i32_e32 v3, 0
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_min_u32_e32 v2, v3, v2
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 146f344..6e55d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -4101,7 +4101,7 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, 0x3f40e400
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 14, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, -14, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -4112,10 +4112,9 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, 0x3f40e400
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, 14, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    v_dual_mul_f32 v1, v2, v1 :: v_dual_add_nc_u32 v0, -14, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3bd3486..5d76b54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -20,10 +20,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -7, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -7, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
@@ -51,10 +51,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -7, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -7, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
@@ -82,10 +82,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, -7, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, -7, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
@@ -113,10 +113,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v1, 6, v0
@@ -150,11 +150,11 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -189,10 +189,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
@@ -219,10 +219,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -7, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -7, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
@@ -249,10 +249,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -7, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -7, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
@@ -279,10 +279,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v3, 6, v2
@@ -315,11 +315,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
@@ -1550,16 +1550,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
@@ -1580,16 +1580,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
@@ -1616,10 +1616,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
@@ -1644,10 +1644,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
@@ -1678,11 +1678,11 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1710,16 +1710,16 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe8, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
@@ -1740,16 +1740,16 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffffffe8, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
@@ -1776,10 +1776,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
@@ -1804,10 +1804,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
@@ -1838,11 +1838,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1887,7 +1887,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s0, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s1, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s8, s2, 0x80008
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v2, v3
 ; GFX6-NEXT:    s_and_b32 s7, s2, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 8
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
@@ -1906,7 +1906,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX6-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX6-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
@@ -1915,53 +1915,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s1, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v2
 ; GFX6-NEXT:    s_lshr_b32 s2, s5, 8
 ; GFX6-NEXT:    s_and_b32 s3, s5, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX6-NEXT:    v_alignbit_b32 v4, s3, v4, 24
+; GFX6-NEXT:    v_alignbit_b32 v5, s3, v5, 24
 ; GFX6-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v4, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    v_or_b32_e32 v4, s2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v2, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v3
+; GFX6-NEXT:    v_or_b32_e32 v5, s2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v2, v5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s1, v4
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, 24
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 23, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 23, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v2, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_lshl_b32_e32 v3, s6, v3
-; GFX6-NEXT:    v_lshr_b32_e32 v5, s0, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_lshl_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_lshr_b32_e32 v6, s0, v6
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX6-NEXT:    v_bfe_u32 v2, v4, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 16, 8
+; GFX6-NEXT:    v_bfe_u32 v2, v4, 16, 8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
@@ -2021,7 +2021,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_not_b32_e32 v1, 23
 ; GFX8-NEXT:    s_or_b32 s3, s8, s3
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
@@ -2031,67 +2031,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s4, s4, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s4, s4, s6
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT:    v_mul_lo_u32 v2, v2, 24
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX8-NEXT:    s_or_b32 s5, s8, s5
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e64 v3, v3, s0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
-; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2172,10 +2172,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2186,10 +2186,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v0
@@ -2282,9 +2282,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX10-NEXT:    s_and_b32 s5, s9, 0xff
 ; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
@@ -2293,13 +2293,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX10-NEXT:    s_or_b32 s3, s10, s3
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_or_b32 s2, s2, s5
 ; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_or_b32 s3, s3, s4
@@ -2399,9 +2399,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_and_b32 s5, s8, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
@@ -2410,7 +2410,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX11-NEXT:    s_or_b32 s2, s2, s5
 ; GFX11-NEXT:    s_or_b32 s3, s9, s3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
@@ -2423,7 +2423,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -2479,31 +2479,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX6-NEXT:    v_mul_lo_u32 v7, v6, v7
-; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX6-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX6-NEXT:    v_mul_lo_u32 v8, v8, 24
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v7
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe8, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
@@ -2526,31 +2526,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX8-NEXT:    v_mul_lo_u32 v7, v6, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX8-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX8-NEXT:    v_mul_lo_u32 v8, v8, 24
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe8, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
@@ -2583,21 +2583,21 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, 0xffffffe8, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, 0xffffffe8, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v5
+; GFX9-NEXT:    v_add_u32_e32 v7, 0xffffffe8, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
@@ -2627,15 +2627,15 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
@@ -2679,34 +2679,32 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
-; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
+; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -6061,11 +6059,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 64, v15
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v15
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, 0xffffffc0, v15
 ; GFX6-NEXT:    v_lshr_b64 v[9:10], v[0:1], v9
 ; GFX6-NEXT:    v_lshl_b64 v[11:12], v[2:3], v15
 ; GFX6-NEXT:    v_lshl_b64 v[13:14], v[0:1], v15
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v16
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v17
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
@@ -6082,8 +6080,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[6:7], 1
 ; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v4
+; GFX6-NEXT:    v_not_b32_e32 v16, 63
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v14
-; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v14, v16
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v14
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], v6
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], v14
@@ -6109,11 +6108,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, 64, v15
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v15
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xffffffc0, v15
 ; GFX8-NEXT:    v_lshrrev_b64 v[9:10], v9, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[11:12], v15, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b64 v[13:14], v15, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v17, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
@@ -6130,8 +6129,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[6:7]
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v4
+; GFX8-NEXT:    v_not_b32_e32 v16, 63
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v14
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v14, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v14, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[2:3]
@@ -6157,7 +6157,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v9, 64, v15
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
+; GFX9-NEXT:    v_add_u32_e32 v16, 0xffffffc0, v15
 ; GFX9-NEXT:    v_lshrrev_b64 v[9:10], v9, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[11:12], v15, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[13:14], v15, v[0:1]
@@ -6178,7 +6178,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v14
-; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_add_u32_e32 v15, 0xffffffc0, v14
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v14, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[2:3]
@@ -6210,7 +6210,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v10
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 31, v5
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
+; GFX10-NEXT:    v_add_nc_u32_e32 v20, 0xffffffc0, v18
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v11, v[0:1]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v18, v[0:1]
@@ -6218,7 +6218,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX10-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v19
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
 ; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
@@ -6258,34 +6258,34 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v10
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v6, 31, v5
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v18, v[0:1]
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v11, v[0:1]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v18, v[0:1]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0xffffffc0, v18
+; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
 ; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v19
 ; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
 ; GFX11-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s0
@@ -6307,15 +6307,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 64, v7
 ; GFX6-NEXT:    v_lshr_b64 v[1:2], s[0:1], v1
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], s[2:3], v7
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0xffffffc0, v7
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], s[0:1], v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_lshl_b64 v[1:2], s[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[1:2], s[0:1], v9
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX6-NEXT:    v_not_b32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s8, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -6324,33 +6324,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX6-NEXT:    v_and_b32_e32 v10, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v11, 0x7f, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v2, v4, vcc
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v10
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v10
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v11
+; GFX6-NEXT:    v_not_b32_e32 v8, 63
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v11
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v10
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[2:3], v10
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], s[2:3], v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_ssv:
@@ -6359,15 +6360,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 64, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[1:2], v1, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xffffffc0, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v7, s[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v4, v2, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v9, s[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_mov_b32 s8, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -6376,33 +6377,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX8-NEXT:    v_and_b32_e32 v10, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v11, 0x7f, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v2, v4, vcc
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v10
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v10, s[0:1]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v11
+; GFX8-NEXT:    v_not_b32_e32 v8, 63
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v10
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v11, v8
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v10, s[2:3]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v11, s[2:3]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_ssv:
@@ -6411,7 +6413,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[1:2], v1, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, 0xffffffc0, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v7, s[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v4, v2, v4
@@ -6436,7 +6438,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v10, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v10
+; GFX9-NEXT:    v_add_u32_e32 v11, 0xffffffc0, v10
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
@@ -6471,12 +6473,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v3, s[0:1]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v12
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xffffffc0, v13
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
@@ -6522,7 +6524,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 64, v12
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v12
 ; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v3, s[0:1]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
@@ -6531,7 +6533,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xffffffc0, v13
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v1
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v7, v7, v9
@@ -7677,12 +7679,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v16
+; GFX6-NEXT:    v_not_b32_e32 v25, 63
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v19
-; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v19
+; GFX6-NEXT:    v_add_i32_e32 v26, vcc, v19, v25
 ; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v19
 ; GFX6-NEXT:    v_lshl_b64 v[23:24], v[0:1], v19
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v25
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v26
 ; GFX6-NEXT:    v_or_b32_e32 v17, v17, v21
 ; GFX6-NEXT:    v_or_b32_e32 v18, v18, v22
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
@@ -7700,7 +7703,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], 1
 ; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v23
-; GFX6-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v23
+; GFX6-NEXT:    v_add_i32_e32 v24, vcc, v23, v25
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v23
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v10
 ; GFX6-NEXT:    v_lshr_b64 v[16:17], v[2:3], v23
@@ -7719,7 +7722,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v18
 ; GFX6-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v18, v25
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v18
 ; GFX6-NEXT:    v_lshl_b64 v[16:17], v[4:5], v18
@@ -7741,7 +7744,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], 1
 ; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v14
-; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v14, v25
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v14
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v10
 ; GFX6-NEXT:    v_lshr_b64 v[12:13], v[6:7], v14
@@ -7768,12 +7771,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v16
+; GFX8-NEXT:    v_not_b32_e32 v25, 63
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v19
-; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v19
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v19, v25
 ; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b64 v[23:24], v19, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v26, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v17, v17, v21
 ; GFX8-NEXT:    v_or_b32_e32 v18, v18, v22
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
@@ -7791,7 +7795,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v23
-; GFX8-NEXT:    v_subrev_u32_e32 v24, vcc, 64, v23
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v23, v25
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v23, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v23, v[2:3]
@@ -7810,7 +7814,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v18
 ; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v18, v25
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
 ; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
@@ -7832,7 +7836,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v14
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v14, v25
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
 ; GFX8-NEXT:    v_lshrrev_b64 v[12:13], v14, v[6:7]
@@ -7860,7 +7864,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v16
 ; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v19
-; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v19
+; GFX9-NEXT:    v_add_u32_e32 v25, 0xffffffc0, v19
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[23:24], v19, v[0:1]
@@ -7881,7 +7885,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v8
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v23
-; GFX9-NEXT:    v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT:    v_add_u32_e32 v24, 0xffffffc0, v23
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v23, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v23, v[2:3]
@@ -7900,7 +7904,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v18
 ; GFX9-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_add_u32_e32 v19, 0xffffffc0, v18
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
@@ -7921,7 +7925,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v14
-; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_add_u32_e32 v15, 0xffffffc0, v14
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], v14, v[6:7]
@@ -7956,13 +7960,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v10, 31, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
+; GFX10-NEXT:    v_add_nc_u32_e32 v29, 0xffffffc0, v27
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
 ; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
 ; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
 ; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX10-NEXT:    v_add_nc_u32_e32 v16, 0xffffffc0, v28
 ; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
@@ -7999,10 +8003,10 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffc0, v24
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v22
 ; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
 ; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
@@ -8049,19 +8053,19 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
 ; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v19, v17, v19
 ; GFX11-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v29, 0xffffffc0, v27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0xffffffc0, v28
 ; GFX11-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v28
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
@@ -8095,26 +8099,26 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffc0, v24
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v22
 ; GFX11-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v22
 ; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
 ; GFX11-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX11-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
 ; GFX11-NEXT:    v_or_b32_e32 v5, v11, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0, v16, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v22
 ; GFX11-NEXT:    v_or_b32_e32 v16, v18, v20
 ; GFX11-NEXT:    v_or_b32_e32 v18, v19, v21
 ; GFX11-NEXT:    v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5
 ; GFX11-NEXT:    v_lshrrev_b64 v[3:4], v22, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v24
 ; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2..dbc8f12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -21,10 +21,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -7, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -7, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
@@ -51,10 +51,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -7, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -7, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
@@ -81,10 +81,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, -7, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, -7, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
@@ -111,10 +111,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v1, 6, v0
@@ -147,11 +147,11 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 7
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -7, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -186,10 +186,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
@@ -216,10 +216,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -7, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -7, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
@@ -246,10 +246,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -7, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -7, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
@@ -276,10 +276,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v3, 6, v2
@@ -312,11 +312,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -7, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1552,16 +1552,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffffff
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
@@ -1583,16 +1583,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xffffff
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
@@ -1620,10 +1620,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
@@ -1649,10 +1649,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
@@ -1684,11 +1684,11 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1717,16 +1717,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe8, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
@@ -1748,16 +1748,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX8-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffffffe8, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
@@ -1785,10 +1785,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
@@ -1814,10 +1814,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
@@ -1849,11 +1849,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1888,7 +1888,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s10, s2, 0x80008
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v2, v3
 ; GFX6-NEXT:    s_or_b32 s8, s8, s9
 ; GFX6-NEXT:    s_and_b32 s9, s2, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s10, s10, 8
@@ -1908,7 +1908,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_or_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
 ; GFX6-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX6-NEXT:    s_and_b32 s7, s4, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s9, s9, 8
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
@@ -1917,62 +1917,62 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s2, s7, s2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v2
 ; GFX6-NEXT:    s_lshr_b32 s3, s5, 8
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX6-NEXT:    v_alignbit_b32 v4, s5, v4, 24
+; GFX6-NEXT:    v_alignbit_b32 v5, s5, v5, 24
 ; GFX6-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v4, 24
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    v_or_b32_e32 v4, s3, v4
-; GFX6-NEXT:    v_mul_hi_u32 v2, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v3
+; GFX6-NEXT:    v_or_b32_e32 v5, s3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v2, v5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s2, v4
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, 24
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 23, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 23, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v2, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX6-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v3
 ; GFX6-NEXT:    s_lshl_b32 s2, s6, 17
 ; GFX6-NEXT:    s_lshl_b32 s3, s8, 1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_lshl_b32_e32 v5, s2, v5
-; GFX6-NEXT:    v_lshr_b32_e32 v3, s1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_lshl_b32_e32 v6, s2, v6
+; GFX6-NEXT:    v_lshr_b32_e32 v4, s1, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 17
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX6-NEXT:    v_bfe_u32 v2, v4, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 16, 8
+; GFX6-NEXT:    v_bfe_u32 v2, v4, 16, 8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
@@ -2024,7 +2024,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_not_b32_e32 v1, 23
 ; GFX8-NEXT:    s_or_b32 s3, s10, s3
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s8
@@ -2034,75 +2034,75 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
 ; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 8
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT:    v_mul_lo_u32 v2, v2, 24
 ; GFX8-NEXT:    s_and_b32 s8, s11, 0xff
 ; GFX8-NEXT:    s_or_b32 s5, s10, s5
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s5, s5, s8
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
 ; GFX8-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2175,11 +2175,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s5, s5, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
@@ -2193,10 +2193,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
@@ -2294,23 +2294,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    s_and_b32 s4, s11, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX10-NEXT:    s_or_b32 s2, s2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s13, 0xff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v1
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s7, 17
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_or_b32 s0, s4, s0
@@ -2393,69 +2393,67 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX11-NEXT:    s_or_b32 s0, s0, s6
 ; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX11-NEXT:    s_or_b32 s3, s12, s3
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_or_b32 s3, s12, s3
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
 ; GFX11-NEXT:    s_lshl_b32 s4, s10, 8
 ; GFX11-NEXT:    s_and_b32 s10, 0xffff, s13
 ; GFX11-NEXT:    s_or_b32 s2, s2, s4
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_lshl_b32 s5, s10, 16
 ; GFX11-NEXT:    s_or_b32 s2, s2, s4
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    s_lshl_b32 s4, s7, 17
-; GFX11-NEXT:    s_lshl_b32 s5, s10, 16
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v1
 ; GFX11-NEXT:    s_or_b32 s0, s4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT:    s_or_b32 s2, s3, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX11-NEXT:    s_or_b32 s2, s3, s5
 ; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
 ; GFX11-NEXT:    s_lshl_b32 s0, s8, 17
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2491,32 +2489,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX6-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX6-NEXT:    v_mul_lo_u32 v8, v8, 24
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v7
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
-; GFX6-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 23, v4
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffffff, v8
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v7
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe8, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
@@ -2540,32 +2538,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX8-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX8-NEXT:    v_mul_lo_u32 v8, v8, 24
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 23, v4
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffffff, v8
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe8, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
@@ -2599,10 +2597,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, 0xffffffe8, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, 0xffffffe8, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
@@ -2610,10 +2608,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v5
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffe8, v5
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, 0xffffffe8, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
@@ -2645,15 +2643,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
@@ -2675,12 +2673,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX11-NEXT:    v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX11-NEXT:    v_mul_lo_u32 v7, 0xffffffe8, v6
@@ -2697,34 +2694,33 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xffffffe8, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0xffffffe8, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
@@ -6087,13 +6083,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_not_b32_e32 v0, v8
 ; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v15
+; GFX6-NEXT:    v_not_b32_e32 v16, 63
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[9:10], v0
 ; GFX6-NEXT:    v_lshl_b64 v[11:12], v[2:3], v15
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v15
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v15, v16
 ; GFX6-NEXT:    v_lshl_b64 v[13:14], v[9:10], v15
 ; GFX6-NEXT:    v_or_b32_e32 v11, v0, v11
 ; GFX6-NEXT:    v_or_b32_e32 v12, v1, v12
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[9:10], v16
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[9:10], v17
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v13, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, 0, v14, vcc
@@ -6106,7 +6103,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v14
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v14
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v14, v16
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v15
@@ -6135,13 +6132,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_not_b32_e32 v0, v8
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v15
+; GFX8-NEXT:    v_not_b32_e32 v16, 63
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[9:10]
 ; GFX8-NEXT:    v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v15
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v15, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[13:14], v15, v[9:10]
 ; GFX8-NEXT:    v_or_b32_e32 v11, v0, v11
 ; GFX8-NEXT:    v_or_b32_e32 v12, v1, v12
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[9:10]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v17, v[9:10]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v13, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, 0, v14, vcc
@@ -6154,7 +6152,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v14
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v14, v16
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
@@ -6185,7 +6183,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v15
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[9:10]
 ; GFX9-NEXT:    v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
+; GFX9-NEXT:    v_add_u32_e32 v16, 0xffffffc0, v15
 ; GFX9-NEXT:    v_lshlrev_b64 v[13:14], v15, v[9:10]
 ; GFX9-NEXT:    v_or_b32_e32 v11, v0, v11
 ; GFX9-NEXT:    v_or_b32_e32 v12, v1, v12
@@ -6202,7 +6200,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_add_u32_e32 v15, 0xffffffc0, v14
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
@@ -6232,9 +6230,9 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
+; GFX10-NEXT:    v_add_nc_u32_e32 v21, 0xffffffc0, v19
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
+; GFX10-NEXT:    v_add_nc_u32_e32 v20, 0xffffffc0, v18
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
@@ -6273,47 +6271,48 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
 ; GFX11-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v8
-; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0xffffffc0, v18
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
 ; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0xffffffc0, v19
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
 ; GFX11-NEXT:    v_or_b32_e32 v12, v12, v16
 ; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v14, v4
 ; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -6335,46 +6334,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v1
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT:    v_not_b32_e32 v8, 63
 ; GFX6-NEXT:    v_lshr_b64 v[1:2], s[8:9], v1
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], s[0:1], v7
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v7, v8
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], s[8:9], v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_lshl_b64 v[1:2], s[8:9], v8
+; GFX6-NEXT:    v_lshl_b64 v[1:2], s[8:9], v9
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_and_b32_e32 v10, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v11, 0x7f, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, v2, v4, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v10
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v10
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v11
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v11
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v10
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v10
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_ssv:
@@ -6387,46 +6387,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v1
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT:    v_not_b32_e32 v8, 63
 ; GFX8-NEXT:    v_lshrrev_b64 v[1:2], v1, s[8:9]
 ; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v7, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v7, s[8:9]
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v4, v2, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, s[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v9, s[8:9]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX8-NEXT:    v_and_b32_e32 v10, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v11, 0x7f, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v2, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v10
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v11
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v10
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v11, v8
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v10, s[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[6:7]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v11, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_ssv:
@@ -6441,7 +6442,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[1:2], v1, s[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v7, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, 0xffffffc0, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v7, s[8:9]
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v4, v2, v4
@@ -6460,7 +6461,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v10, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v10
+; GFX9-NEXT:    v_add_u32_e32 v11, 0xffffffc0, v10
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
@@ -6492,10 +6493,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, 0xffffffc0, v13
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v12
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
@@ -6544,11 +6545,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v12
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0xffffffc0, v13
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
 ; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
@@ -7718,13 +7719,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_not_b32_e32 v0, v16
 ; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v19
+; GFX6-NEXT:    v_not_b32_e32 v25, 63
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[17:18], v0
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v19
-; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v19
+; GFX6-NEXT:    v_add_i32_e32 v26, vcc, v19, v25
 ; GFX6-NEXT:    v_lshl_b64 v[23:24], v[17:18], v19
 ; GFX6-NEXT:    v_or_b32_e32 v21, v0, v21
 ; GFX6-NEXT:    v_or_b32_e32 v22, v1, v22
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[17:18], v25
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[17:18], v26
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
 ; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v23, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v23, 0, v24, vcc
@@ -7737,7 +7739,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v22
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v22
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v22
+; GFX6-NEXT:    v_add_i32_e32 v24, vcc, v22, v25
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
@@ -7761,7 +7763,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[8:9], v4
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v18
 ; GFX6-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v18, v25
 ; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v18
 ; GFX6-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v5, v11
@@ -7778,7 +7780,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v18
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v18
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v18, v25
 ; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v19
@@ -7809,13 +7811,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_not_b32_e32 v0, v16
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v19
+; GFX8-NEXT:    v_not_b32_e32 v25, 63
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[17:18]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v19
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v19, v25
 ; GFX8-NEXT:    v_lshlrev_b64 v[23:24], v19, v[17:18]
 ; GFX8-NEXT:    v_or_b32_e32 v21, v0, v21
 ; GFX8-NEXT:    v_or_b32_e32 v22, v1, v22
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[17:18]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v26, v[17:18]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v23, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, 0, v24, vcc
@@ -7828,7 +7831,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v22
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v22, v[8:9]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX8-NEXT:    v_subrev_u32_e32 v24, vcc, 64, v22
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v22, v25
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
@@ -7852,7 +7855,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v18, v25
 ; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v18, v[8:9]
 ; GFX8-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v5, v11
@@ -7869,7 +7872,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v18
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v18, v25
 ; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
@@ -7902,7 +7905,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v19
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[17:18]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v19
+; GFX9-NEXT:    v_add_u32_e32 v25, 0xffffffc0, v19
 ; GFX9-NEXT:    v_lshlrev_b64 v[23:24], v19, v[17:18]
 ; GFX9-NEXT:    v_or_b32_e32 v21, v0, v21
 ; GFX9-NEXT:    v_or_b32_e32 v22, v1, v22
@@ -7919,7 +7922,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v21, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v22, v[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX9-NEXT:    v_subrev_u32_e32 v24, 64, v22
+; GFX9-NEXT:    v_add_u32_e32 v24, 0xffffffc0, v22
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
@@ -7942,7 +7945,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_add_u32_e32 v19, 0xffffffc0, v18
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v18, v[8:9]
 ; GFX9-NEXT:    v_or_b32_e32 v10, v4, v10
@@ -7960,7 +7963,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_add_u32_e32 v19, 0xffffffc0, v18
 ; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
@@ -7991,11 +7994,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
+; GFX10-NEXT:    v_add_nc_u32_e32 v27, 0xffffffc0, v26
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v26
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
+; GFX10-NEXT:    v_add_nc_u32_e32 v19, 0xffffffc0, v25
 ; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v25, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
 ; GFX10-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
@@ -8035,12 +8038,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffc0, v25
 ; GFX10-NEXT:    v_or_b32_e32 v2, v18, v2
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v25, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
 ; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v23
 ; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
 ; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
@@ -8091,41 +8094,41 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
 ; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v26
 ; GFX11-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0xffffffc0, v25
 ; GFX11-NEXT:    v_or_b32_e32 v21, v17, v21
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
 ; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v1, v22, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v18, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v0, v21, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v22, v1, v22 :: v_dual_cndmask_b32 v21, v0, v21
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0xffffffc0, v26
 ; GFX11-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX11-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s0
 ; GFX11-NEXT:    v_not_b32_e32 v16, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v16
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s0
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8
+; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
 ; GFX11-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s0
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffc0, v25
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
@@ -8143,7 +8146,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v23
 ; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v23
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0..cc185af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1577,7 +1577,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v3
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[0:1], v3
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v8
 ; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v3
@@ -1599,7 +1599,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
@@ -1621,7 +1621,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffc0, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
@@ -1643,7 +1643,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v3
@@ -1664,20 +1664,20 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
 ; GFX11-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v7, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 2c2f8e9..88eb0e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -325,7 +325,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 0xffed2705, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
@@ -353,29 +353,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v5, v3
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v3
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v3
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v3
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
-; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 0xffed2705, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v7, v5, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
@@ -398,29 +398,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_lo_u32 v7, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT:    v_mul_lo_u32 v7, v4, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v3
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; CGP-NEXT:    v_mul_lo_u32 v8, v7, v5
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v0, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[6:7]
-; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[6:7]
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 0xffed2705, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92..b12e915 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1583,7 +1583,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v3
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v4
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[2:3], v3
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v3
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffc0, v3
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[0:1], v3
 ; GFX6-NEXT:    v_or_b32_e32 v9, v4, v5
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], v8
@@ -1601,7 +1601,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v3
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffffc0, v3
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v9, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
@@ -1619,7 +1619,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v3
+; GFX9-NEXT:    v_add_u32_e32 v8, 0xffffffc0, v3
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v9, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
@@ -1636,7 +1636,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v3
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
@@ -1654,7 +1654,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0xffffffc0, v3
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 1bb606f..2b12e4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -268,10 +268,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 12, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x1000, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xfffff000, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x1000, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xfffff000, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -297,23 +297,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 12, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 12, v7
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x1000, v1
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 0xfffff000, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x1000, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xfffff000, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
@@ -338,23 +338,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_lo_u32 v7, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
+; CGP-NEXT:    v_lshlrev_b32_e32 v7, 12, v7
 ; CGP-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
-; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 0xfffff000, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
-; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff000, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
@@ -386,10 +386,10 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xffed2705, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xffed2705, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -415,23 +415,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 0xffed2705, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
@@ -456,23 +456,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_lo_u32 v7, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v5
+; CGP-NEXT:    v_mul_lo_u32 v7, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
-; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 0xffed2705, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
-; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 0xffed2705, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2572f85..7214f4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -12,9 +12,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -27,9 +27,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -63,9 +63,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
@@ -79,11 +79,11 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX8-NEXT:    s_max_i32 s4, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 9
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -122,9 +122,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -137,9 +137,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -173,9 +173,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
@@ -189,11 +189,11 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX8-NEXT:    s_max_i32 s4, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -234,18 +234,19 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -266,16 +267,16 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
 ; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_add_u16_e32 v4, 0x8000, v4
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
@@ -355,18 +356,18 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s5, s0, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s4, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s5
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s3, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
@@ -387,11 +388,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s4, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s6, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -401,11 +402,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX8-NEXT:    s_max_i32 s4, s3, s5
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s3, 0x8000
 ; GFX8-NEXT:    s_max_i32 s2, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -509,29 +510,29 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v1, v8, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
+; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
@@ -539,10 +540,10 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
@@ -573,34 +574,34 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
 ; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
+; GFX8-NEXT:    v_add_u16_e32 v8, 0x8000, v8
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
 ; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
+; GFX8-NEXT:    v_add_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT:    v_max_i16_e32 v5, -1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x7fff, v5
+; GFX8-NEXT:    v_add_u16_e32 v5, 0x8001, v5
 ; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
+; GFX8-NEXT:    v_add_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_max_i16_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v4
@@ -727,27 +728,27 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s9, s0, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s8, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s1, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s5, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s5, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
@@ -755,10 +756,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
@@ -789,11 +790,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_sub_i32 s10, s10, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s10, 0x8001
 ; GFX8-NEXT:    s_min_i32 s8, s8, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s8, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s10, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -803,11 +804,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s2, s5, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s1
 ; GFX8-NEXT:    s_max_i32 s8, s5, s9
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s8, 0x8001
 ; GFX8-NEXT:    s_min_i32 s5, s5, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s5, 0x8000
 ; GFX8-NEXT:    s_max_i32 s2, s8, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -817,11 +818,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_lshl_b32 s3, s6, 8
 ; GFX8-NEXT:    s_max_i32 s6, s5, s9
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_min_i32 s5, s5, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s5, 0x8000
 ; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -831,12 +832,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s3
 ; GFX8-NEXT:    s_max_i32 s6, s5, s9
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, 8
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_min_i32 s5, s5, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s5, 0x8000
 ; GFX8-NEXT:    s_max_i32 s4, s6, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
@@ -1004,9 +1005,9 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -1055,9 +1056,9 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
@@ -1109,9 +1110,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -1121,9 +1122,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000001, v2
 ; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 0x80000000, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v3
 ; GFX8-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -1148,9 +1149,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
@@ -1159,9 +1160,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s2, s0, -1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s3, s0, -1
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s1, s2, s1
 ; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
@@ -1187,9 +1188,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX6-LABEL: ssubsat_i32_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
-; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s2, s0, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
@@ -1198,9 +1199,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX8-LABEL: ssubsat_i32_sv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s1, s0, -1
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s1, s1, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s2, s0, -1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0x80000000
+; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000000
 ; GFX8-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
@@ -1224,9 +1225,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: ssubsat_i32_vs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000001, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x80000000, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000000, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -1235,9 +1236,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 ; GFX8-LABEL: ssubsat_i32_vs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_max_i32_e32 v1, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000001, v1
 ; GFX8-NEXT:    v_min_i32_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x80000000, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000000, v2
 ; GFX8-NEXT:    v_max_i32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -1262,16 +1263,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -1281,16 +1282,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000001, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000001, v2
 ; GFX8-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000000, v4
 ; GFX8-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v2
@@ -1317,16 +1318,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX6-LABEL: s_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s4, s0, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s5, s0, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_max_i32 s2, s1, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s2, s3
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
@@ -1335,16 +1336,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX8-LABEL: s_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s4, s0, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s5, s0, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s2, s4, s2
 ; GFX8-NEXT:    s_min_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_max_i32 s2, s1, -1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s4, s1, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s2, s2, s3
 ; GFX8-NEXT:    s_min_i32 s2, s2, s4
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
@@ -1376,24 +1377,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0x80000001, v6
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, 1
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v8
-; GFX6-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
@@ -1403,24 +1405,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x80000001, v6
 ; GFX8-NEXT:    v_min_i32_e32 v8, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x80000000, v8
+; GFX8-NEXT:    v_bfrev_b32_e32 v9, 1
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v8
-; GFX8-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x80000001
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000001, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000000, v4
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
@@ -1449,23 +1452,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6-LABEL: s_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s6, s0, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s7, s0, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x80000000
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s6, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s7
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s1, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s3, s4
 ; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX6-NEXT:    s_max_i32 s3, s2, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s4, s2, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s3, s5
 ; GFX6-NEXT:    s_min_i32 s3, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
@@ -1474,23 +1477,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX8-LABEL: s_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s6, s0, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s7, s0, -1
-; GFX8-NEXT:    s_sub_i32 s7, s7, 0x80000000
+; GFX8-NEXT:    s_add_i32 s7, s7, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_min_i32 s3, s3, s7
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_max_i32 s3, s1, -1
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s6, s1, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_min_i32 s3, s3, s6
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_max_i32 s3, s2, -1
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s4, s2, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s3
@@ -1527,32 +1530,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
+; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v7
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
@@ -1562,32 +1565,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x80000001, v8
 ; GFX8-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v10
-; GFX8-NEXT:    v_bfrev_b32_e32 v9, -2
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0x80000001
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v9
 ; GFX8-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x80000000, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v11
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000001, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v7
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v4
@@ -1618,30 +1621,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6-LABEL: s_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s8, s0, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s9, s0, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s8, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_max_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s1, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_min_i32 s4, s4, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_max_i32 s4, s2, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s5, s2, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s6
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX6-NEXT:    s_max_i32 s4, s3, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s5, s3, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s7
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
@@ -1650,30 +1653,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX8-LABEL: s_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s8, s0, -1
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s8, s8, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s9, s0, -1
-; GFX8-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX8-NEXT:    s_add_i32 s9, s9, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s8, s4
 ; GFX8-NEXT:    s_min_i32 s4, s4, s9
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_max_i32 s4, s1, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s8, s1, -1
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX8-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_max_i32 s4, s2, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s5, s2, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX8-NEXT:    s_max_i32 s4, s3, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s5, s3, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s7
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
@@ -1715,39 +1718,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 0x80000001, v10
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v13
+; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v12
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX6-NEXT:    v_mov_b32_e32 v11, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v13
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000001, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
@@ -1757,39 +1760,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x80000001, v10
 ; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
-; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v13
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v12
-; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0x80000001
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v13
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v4
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000001, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v4
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
@@ -1822,37 +1825,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6-LABEL: s_ssubsat_v5i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s10, s0, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s11, s0, -1
-; GFX6-NEXT:    s_sub_i32 s11, s11, 0x80000000
+; GFX6-NEXT:    s_add_i32 s11, s11, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s10, s5
 ; GFX6-NEXT:    s_min_i32 s5, s5, s11
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s1, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_min_i32 s5, s5, s10
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s7
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s8
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX6-NEXT:    s_max_i32 s5, s4, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s4, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s9
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
@@ -1861,37 +1864,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX8-LABEL: s_ssubsat_v5i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s10, s0, -1
-; GFX8-NEXT:    s_sub_i32 s10, s10, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s10, s10, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s11, s0, -1
-; GFX8-NEXT:    s_sub_i32 s11, s11, 0x80000000
+; GFX8-NEXT:    s_add_i32 s11, s11, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s10, s5
 ; GFX8-NEXT:    s_min_i32 s5, s5, s11
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX8-NEXT:    s_max_i32 s5, s1, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s10, s1, -1
-; GFX8-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX8-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_min_i32 s5, s5, s10
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_max_i32 s5, s2, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s6, s2, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s7
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX8-NEXT:    s_max_i32 s5, s3, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s6, s3, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s8
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX8-NEXT:    s_max_i32 s5, s4, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s6, s4, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s9
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s5
@@ -1938,117 +1941,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v32, -1, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v31, -2
-; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v31
+; GFX6-NEXT:    v_mov_b32_e32 v31, 0x80000001
+; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v31
 ; GFX6-NEXT:    v_max_i32_e32 v32, v32, v16
 ; GFX6-NEXT:    v_min_i32_e32 v33, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX6-NEXT:    v_sub_i32_e32 v33, vcc, v33, v16
+; GFX6-NEXT:    v_add_i32_e32 v33, vcc, v33, v16
 ; GFX6-NEXT:    v_min_i32_e32 v32, v32, v33
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v32
 ; GFX6-NEXT:    v_max_i32_e32 v32, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v31
+; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v31
 ; GFX6-NEXT:    v_max_i32_e32 v17, v32, v17
 ; GFX6-NEXT:    v_min_i32_e32 v32, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v16
+; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v32
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v5
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v6
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v6
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v7
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v7
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v8
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v8
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v9
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v9
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v10
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v10
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v11
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v11
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v12
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v12
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v13
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v13
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v14
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v14
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v15
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v15
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v19, v16
+; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
@@ -2059,117 +2062,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v32, -1, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v31, -2
-; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    v_mov_b32_e32 v31, 0x80000001
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
 ; GFX8-NEXT:    v_max_i32_e32 v32, v32, v16
 ; GFX8-NEXT:    v_min_i32_e32 v33, -1, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX8-NEXT:    v_sub_u32_e32 v33, vcc, v33, v16
+; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v16
 ; GFX8-NEXT:    v_min_i32_e32 v32, v32, v33
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v32
 ; GFX8-NEXT:    v_max_i32_e32 v32, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
 ; GFX8-NEXT:    v_max_i32_e32 v17, v32, v17
 ; GFX8-NEXT:    v_min_i32_e32 v32, -1, v1
-; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v16
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v32
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_min_i32_e32 v18, -1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v18, -1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v4
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v18, -1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v5
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v18, -1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v6
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v18, -1, v6
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v7
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v7
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, v7, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v8
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v8
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v8, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v9
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v9
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, v9, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v10
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v10
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v11
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v11
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, v11, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v12
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v12
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v13
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v13
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v13, vcc, v13, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v14
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v14
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, v14, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v15
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v15
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v19, v16
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v19, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
@@ -2252,114 +2255,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX6-LABEL: s_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s32, s0, -1
-; GFX6-NEXT:    s_sub_i32 s32, s32, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s32, s32, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s33, s0, -1
-; GFX6-NEXT:    s_sub_i32 s33, s33, 0x80000000
+; GFX6-NEXT:    s_add_i32 s33, s33, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s32, s16
 ; GFX6-NEXT:    s_min_i32 s16, s16, s33
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX6-NEXT:    s_max_i32 s16, s1, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s32, s1, -1
-; GFX6-NEXT:    s_sub_i32 s32, s32, 0x80000000
+; GFX6-NEXT:    s_add_i32 s32, s32, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s17
 ; GFX6-NEXT:    s_min_i32 s16, s16, s32
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX6-NEXT:    s_max_i32 s16, s2, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s2, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s18
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s16
 ; GFX6-NEXT:    s_max_i32 s16, s3, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s3, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s19
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s16
 ; GFX6-NEXT:    s_max_i32 s16, s4, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s4, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s20
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s16
 ; GFX6-NEXT:    s_max_i32 s16, s5, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s5, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s21
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s16
 ; GFX6-NEXT:    s_max_i32 s16, s6, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s6, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s22
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s16
 ; GFX6-NEXT:    s_max_i32 s16, s7, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s7, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s23
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s16
 ; GFX6-NEXT:    s_max_i32 s16, s8, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s8, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s24
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s16, s9, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s9, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s25
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
 ; GFX6-NEXT:    s_max_i32 s16, s10, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s10, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s26
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s16
 ; GFX6-NEXT:    s_max_i32 s16, s11, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s11, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s27
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s16
 ; GFX6-NEXT:    s_max_i32 s16, s12, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s12, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s28
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s16
 ; GFX6-NEXT:    s_max_i32 s16, s13, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s13, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s29
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s16
 ; GFX6-NEXT:    s_max_i32 s16, s14, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s14, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s30
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s16
 ; GFX6-NEXT:    s_max_i32 s16, s15, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s15, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s31
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s16
@@ -2368,114 +2371,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX8-LABEL: s_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s32, s0, -1
-; GFX8-NEXT:    s_sub_i32 s32, s32, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s33, s0, -1
-; GFX8-NEXT:    s_sub_i32 s33, s33, 0x80000000
+; GFX8-NEXT:    s_add_i32 s33, s33, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s32, s16
 ; GFX8-NEXT:    s_min_i32 s16, s16, s33
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX8-NEXT:    s_max_i32 s16, s1, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s32, s1, -1
-; GFX8-NEXT:    s_sub_i32 s32, s32, 0x80000000
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s17
 ; GFX8-NEXT:    s_min_i32 s16, s16, s32
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX8-NEXT:    s_max_i32 s16, s2, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s2, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s18
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s16
 ; GFX8-NEXT:    s_max_i32 s16, s3, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s3, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s19
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s16
 ; GFX8-NEXT:    s_max_i32 s16, s4, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s4, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s20
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s16
 ; GFX8-NEXT:    s_max_i32 s16, s5, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s5, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s21
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s16
 ; GFX8-NEXT:    s_max_i32 s16, s6, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s6, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s22
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s16
 ; GFX8-NEXT:    s_max_i32 s16, s7, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s7, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s23
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s16
 ; GFX8-NEXT:    s_max_i32 s16, s8, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s8, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s24
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
 ; GFX8-NEXT:    s_max_i32 s16, s9, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s9, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s25
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
 ; GFX8-NEXT:    s_max_i32 s16, s10, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s10, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s26
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s16
 ; GFX8-NEXT:    s_max_i32 s16, s11, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s11, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s27
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s16
 ; GFX8-NEXT:    s_max_i32 s16, s12, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s12, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s28
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s16
 ; GFX8-NEXT:    s_max_i32 s16, s13, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s13, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s29
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s16
 ; GFX8-NEXT:    s_max_i32 s16, s14, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s14, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s30
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s14, s14, s16
 ; GFX8-NEXT:    s_max_i32 s16, s15, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX8-NEXT:    s_min_i32 s17, s15, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s31
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s15, s15, s16
@@ -2579,9 +2582,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -2592,9 +2595,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -2621,9 +2624,9 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
@@ -2635,11 +2638,11 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX8-NEXT:    s_max_i32 s4, s2, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -2669,9 +2672,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s2, s0, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
@@ -2683,9 +2686,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, -1
 ; GFX8-NEXT:    s_max_i32 s3, s1, s2
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s3, 0x8001
 ; GFX8-NEXT:    s_min_i32 s1, s1, s2
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s1, 0x8000
 ; GFX8-NEXT:    v_max_i16_e32 v0, s3, v0
 ; GFX8-NEXT:    v_min_i16_e32 v0, s1, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v0, s0, v0
@@ -2711,9 +2714,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000001, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x80000000, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000000, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -2723,9 +2726,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 ; GFX8-LABEL: ssubsat_i16_vs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -2752,18 +2755,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -2775,16 +2779,16 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v2, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
 ; GFX8-NEXT:    v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
@@ -2813,18 +2817,18 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s5, s0, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s3, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
@@ -2841,12 +2845,12 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s5, -1
 ; GFX8-NEXT:    s_max_i32 s6, s4, s5
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s4, 0x8000
 ; GFX8-NEXT:    s_max_i32 s1, s6, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -2855,11 +2859,11 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s2
 ; GFX8-NEXT:    s_max_i32 s4, s1, s5
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s1, 0x8000
 ; GFX8-NEXT:    s_max_i32 s3, s4, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
@@ -2894,18 +2898,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s3, s0, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s3, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s2, s0, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
@@ -2922,18 +2926,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX8-NEXT:    s_max_i32 s4, s2, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
 ; GFX8-NEXT:    v_min_i16_e32 v1, s2, v1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s1
 ; GFX8-NEXT:    s_max_i32 s4, s2, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
@@ -2962,18 +2966,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
+; GFX6-NEXT:    v_min_i32_e32 v4, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
-; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
+; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -2988,17 +2994,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX8-LABEL: ssubsat_v2i16_vs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, 0x8001, v3
 ; GFX8-NEXT:    v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
 ; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
@@ -3038,38 +3044,38 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
+; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
@@ -3091,28 +3097,28 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
 ; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_max_i16_e32 v4, v4, v2
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v5, -1
 ; GFX8-NEXT:    v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
 ; GFX8-NEXT:    v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v6, -1, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
-; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
 ; GFX8-NEXT:    v_min_i16_e32 v7, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
 ; GFX8-NEXT:    v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x7fff, v7
+; GFX8-NEXT:    v_add_u16_e32 v7, 0x8001, v7
 ; GFX8-NEXT:    v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v0, v4
@@ -3147,36 +3153,36 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s8, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s9, s0, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s8, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s9
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s1, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s8
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
@@ -3199,12 +3205,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s9, -1
 ; GFX8-NEXT:    s_max_i32 s10, s8, s9
-; GFX8-NEXT:    s_sub_i32 s10, s10, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s10, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_min_i32 s8, s8, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s8, 0x8000
 ; GFX8-NEXT:    s_max_i32 s2, s10, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -3213,11 +3219,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
 ; GFX8-NEXT:    s_max_i32 s8, s2, s9
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s8, 0x8001
 ; GFX8-NEXT:    s_min_i32 s2, s2, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    s_max_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -3225,12 +3231,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s2, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX8-NEXT:    s_max_i32 s6, s4, s9
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_min_i32 s4, s4, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s4, 0x8000
 ; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3239,11 +3245,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
 ; GFX8-NEXT:    s_max_i32 s4, s3, s9
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s4, 0x8001
 ; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s3, 0x8000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -3299,57 +3305,57 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, 0x7fffffff, v12
+; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 0x80000001, v12
 ; GFX6-NEXT:    v_min_i32_e32 v14, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
-; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v15
+; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v14
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
+; GFX6-NEXT:    v_mov_b32_e32 v13, 0x80000001
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v15
+; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
@@ -3376,40 +3382,40 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
 ; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v7, -1
 ; GFX8-NEXT:    v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
 ; GFX8-NEXT:    v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v8, -1, v1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
 ; GFX8-NEXT:    v_min_i16_e32 v9, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
 ; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
 ; GFX8-NEXT:    v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8001, v9
 ; GFX8-NEXT:    v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x8000, v10
+; GFX8-NEXT:    v_add_u16_e32 v10, 0x8000, v10
 ; GFX8-NEXT:    v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v9, -1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v10
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8001, v9
 ; GFX8-NEXT:    v_min_i16_e32 v10, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x8000, v10
+; GFX8-NEXT:    v_add_u16_e32 v10, 0x8000, v10
 ; GFX8-NEXT:    v_max_i16_e32 v9, v9, v5
 ; GFX8-NEXT:    v_min_i16_e32 v9, v9, v10
 ; GFX8-NEXT:    v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
 ; GFX8-NEXT:    v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v5, v5, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v0, v6
@@ -3449,55 +3455,55 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s12, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_sub_i32 s12, s12, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s12, s12, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s13, s0, -1
-; GFX6-NEXT:    s_sub_i32 s13, s13, 0x80000000
+; GFX6-NEXT:    s_add_i32 s13, s13, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s12, s6
 ; GFX6-NEXT:    s_min_i32 s6, s6, s13
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
 ; GFX6-NEXT:    s_max_i32 s7, s1, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s12, s1, -1
-; GFX6-NEXT:    s_sub_i32 s12, s12, 0x80000000
+; GFX6-NEXT:    s_add_i32 s12, s12, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s12
 ; GFX6-NEXT:    s_max_i32 s7, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s2, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s3, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s4, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s8, s5, -1
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
@@ -3525,12 +3531,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s13, -1
 ; GFX8-NEXT:    s_max_i32 s14, s12, s13
-; GFX8-NEXT:    s_sub_i32 s14, s14, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s14, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX8-NEXT:    s_min_i32 s12, s12, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s12, s12, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s12, 0x8000
 ; GFX8-NEXT:    s_max_i32 s3, s14, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
@@ -3539,11 +3545,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
 ; GFX8-NEXT:    s_max_i32 s12, s3, s13
-; GFX8-NEXT:    s_sub_i32 s12, s12, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s12, 0x8001
 ; GFX8-NEXT:    s_min_i32 s3, s3, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s3, 0x8000
 ; GFX8-NEXT:    s_max_i32 s9, s12, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -3551,12 +3557,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s1
 ; GFX8-NEXT:    s_max_i32 s9, s6, s13
-; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s9, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_min_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s6, 0x8000
 ; GFX8-NEXT:    s_max_i32 s4, s9, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3565,11 +3571,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s7
 ; GFX8-NEXT:    s_max_i32 s6, s4, s13
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_min_i32 s4, s4, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s10
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s4, 0x8000
 ; GFX8-NEXT:    s_max_i32 s6, s6, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3577,12 +3583,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s2
 ; GFX8-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX8-NEXT:    s_max_i32 s7, s6, s13
-; GFX8-NEXT:    s_sub_i32 s7, s7, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s7, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_min_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s6, 0x8000
 ; GFX8-NEXT:    s_max_i32 s5, s7, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3591,11 +3597,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s8
 ; GFX8-NEXT:    s_max_i32 s6, s5, s13
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s6, 0x8001
 ; GFX8-NEXT:    s_min_i32 s5, s5, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s5, 0x8000
 ; GFX8-NEXT:    s_max_i32 s6, s6, s7
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3648,66 +3654,66 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
+; GFX6-NEXT:    v_mov_b32_e32 v17, 0x80000001
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
+; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v19
+; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v18
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v16, -1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v19
+; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v5
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v6
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
@@ -3715,10 +3721,10 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
@@ -3750,52 +3756,52 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
 ; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
 ; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
 ; GFX8-NEXT:    v_mov_b32_e32 v9, -1
 ; GFX8-NEXT:    v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
 ; GFX8-NEXT:    v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_add_u16_e32 v11, 0x8000, v11
 ; GFX8-NEXT:    v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v10, -1, v1
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v11
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
 ; GFX8-NEXT:    v_min_i16_e32 v11, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_add_u16_e32 v11, 0x8000, v11
 ; GFX8-NEXT:    v_max_i16_e32 v10, v10, v5
 ; GFX8-NEXT:    v_min_i16_e32 v10, v10, v11
 ; GFX8-NEXT:    v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_add_u16_e32 v11, 0x8001, v11
 ; GFX8-NEXT:    v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_add_u16_e32 v12, 0x8000, v12
 ; GFX8-NEXT:    v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v11, -1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_add_u16_e32 v11, 0x8001, v11
 ; GFX8-NEXT:    v_min_i16_e32 v12, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_add_u16_e32 v12, 0x8000, v12
 ; GFX8-NEXT:    v_max_i16_e32 v11, v11, v6
 ; GFX8-NEXT:    v_min_i16_e32 v11, v11, v12
 ; GFX8-NEXT:    v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_add_u16_e32 v12, 0x8001, v12
 ; GFX8-NEXT:    v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_add_u16_e32 v13, 0x8000, v13
 ; GFX8-NEXT:    v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v12, -1, v3
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v13
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_add_u16_e32 v12, 0x8001, v12
 ; GFX8-NEXT:    v_min_i16_e32 v13, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_add_u16_e32 v13, 0x8000, v13
 ; GFX8-NEXT:    v_max_i16_e32 v12, v12, v7
 ; GFX8-NEXT:    v_min_i16_e32 v12, v12, v13
 ; GFX8-NEXT:    v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x7fff, v13
+; GFX8-NEXT:    v_add_u16_e32 v13, 0x8001, v13
 ; GFX8-NEXT:    v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e32 v8, v0, v8
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3840,63 +3846,63 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s16, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s17, s0, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s16, s8
 ; GFX6-NEXT:    s_min_i32 s8, s8, s17
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
 ; GFX6-NEXT:    s_max_i32 s9, s1, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s16, s1, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, 0x80000000
+; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s9, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s2, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s3, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s4, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s5, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s6, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
@@ -3904,10 +3910,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
 ; GFX6-NEXT:    s_min_i32 s10, s7, -1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3940,12 +3946,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s16, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s17, -1
 ; GFX8-NEXT:    s_max_i32 s18, s16, s17
-; GFX8-NEXT:    s_sub_i32 s18, s18, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s18, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s16, 0x8000
 ; GFX8-NEXT:    s_max_i32 s4, s18, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s16, s16
@@ -3954,11 +3960,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s8
 ; GFX8-NEXT:    s_max_i32 s16, s4, s17
-; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s16, 0x8001
 ; GFX8-NEXT:    s_min_i32 s4, s4, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s16, s16
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s4, 0x8000
 ; GFX8-NEXT:    s_max_i32 s12, s16, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3966,12 +3972,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s1
 ; GFX8-NEXT:    s_max_i32 s12, s8, s17
-; GFX8-NEXT:    s_sub_i32 s12, s12, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s12, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s8, 0x8000
 ; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -3980,11 +3986,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s9
 ; GFX8-NEXT:    s_max_i32 s8, s5, s17
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s8, 0x8001
 ; GFX8-NEXT:    s_min_i32 s5, s5, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s13
-; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s5, 0x8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -3992,12 +3998,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s2
 ; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_max_i32 s9, s8, s17
-; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s9, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s8, 0x8000
 ; GFX8-NEXT:    s_max_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -4006,23 +4012,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s10
 ; GFX8-NEXT:    s_max_i32 s8, s6, s17
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s8, 0x8001
 ; GFX8-NEXT:    s_min_i32 s6, s6, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s14
-; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s6, 0x8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_min_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX8-NEXT:    s_max_i32 s9, s8, s17
-; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s9, 0x8001
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s8, 0x8000
 ; GFX8-NEXT:    s_max_i32 s7, s9, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -4031,14 +4037,14 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
 ; GFX8-NEXT:    s_max_i32 s8, s7, s17
-; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_addk_i32 s8, 0x8001
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_min_i32 s7, s7, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s15
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_sub_i32 s7, s7, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s7, 0x8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index 8556872..6c10470 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -147,10 +147,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_not_b32_e32 v1, 63
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0xffc0, v0
-; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 64
+; GFX8-NEXT:    v_add_u16_e32 v1, 64, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
@@ -179,9 +179,9 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 4
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0xffc0, v0
-; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v2, -4
+; GFX8-NEXT:    v_add_u16_e32 v1, 64, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -211,10 +211,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_not_b32_e32 v1, 63
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 4, v0
-; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 64
+; GFX8-NEXT:    v_add_u16_e32 v1, -4, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi:
@@ -245,8 +245,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_add_i32 s0, s0, 0xffff0040
+; GFX8-NEXT:    s_add_i32 s1, s1, 0xffff0040
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
@@ -285,8 +285,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_sub_i32 s1, s1, 4
+; GFX8-NEXT:    s_add_i32 s0, s0, 0xffff0040
+; GFX8-NEXT:    s_add_i32 s1, s1, -4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
@@ -325,8 +325,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 4
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_add_i32 s0, s0, -4
+; GFX8-NEXT:    s_add_i32 s1, s1, 0xffff0040
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 31f61b9..24ec4fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -222,10 +222,10 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -243,23 +243,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v3
+; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 0xffed2705, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
@@ -274,23 +274,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v5, v2, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v2, v2, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v4
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 0xffed2705, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
new file mode 100644
index 0000000..d4c50cf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -0,0 +1,27 @@
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
+
+; CHECK: LLVM ERROR: failed to find free scratch register
+
+declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
+declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
+declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
+
+define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+  call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
+  ret void
+}
+
+define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+  call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
+  ret void
+}
+
+define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+  call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 8766303..d35b5fe 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -28,10 +28,8 @@ declare hidden void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg)
 
 declare hidden void @external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg, i32 inreg, i64 inreg) #0
 
-declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
-declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
-declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
-declare hidden void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([15 x i32] inreg, i32 inreg) #1
+declare hidden void @external_void_func_a15i32_inreg([13 x i32] inreg) #0
+declare hidden void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([13 x i32] inreg, i32 inreg) #1
 
 define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_inreg:
@@ -534,12 +532,6 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
   ret void
 }
 
-; FIXME:
-; define void @test_call_external_void_func_v16i32_inreg(<16 x i32> inreg %arg) #0 {
-;   call void @external_void_func_v16i32_inreg(<16 x i32> inreg %arg)
-;   ret void
-; }
-
 define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-LABEL: test_call_external_void_func_f16_inreg:
 ; GFX9:       ; %bb.0:
@@ -1402,16 +1394,16 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
   ret void
 }
 
-define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
 ; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s29, s33
+; GFX9-NEXT:    s_mov_b32 s27, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_or_saveexec_b64 vcc, -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, vcc
-; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX9-NEXT:    v_writelane_b32 v40, s27, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s3, s17
@@ -1427,13 +1419,11 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX9-NEXT:    s_mov_b32 s22, s24
 ; GFX9-NEXT:    s_mov_b32 s23, s25
 ; GFX9-NEXT:    s_mov_b32 s24, s26
-; GFX9-NEXT:    s_mov_b32 s25, s27
-; GFX9-NEXT:    s_mov_b32 s26, s28
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 vcc
-; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], vcc
+; GFX9-NEXT:    s_getpc_b64 s[28:29]
+; GFX9-NEXT:    s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[28:29]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1448,19 +1438,17 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s25, s33
+; GFX11-NEXT:    s_mov_b32 s23, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s26, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s24, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s26
-; GFX11-NEXT:    v_writelane_b32 v40, s25, 2
-; GFX11-NEXT:    s_mov_b32 s26, s24
-; GFX11-NEXT:    s_mov_b32 s25, s23
+; GFX11-NEXT:    s_mov_b32 exec_lo, s24
+; GFX11-NEXT:    v_writelane_b32 v40, s23, 2
 ; GFX11-NEXT:    s_mov_b32 s24, s22
 ; GFX11-NEXT:    s_mov_b32 s23, s21
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_mov_b32 s22, s20
 ; GFX11-NEXT:    s_mov_b32 s21, s19
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_mov_b32 s20, s18
 ; GFX11-NEXT:    s_mov_b32 s19, s17
 ; GFX11-NEXT:    s_mov_b32 s18, s16
@@ -1468,11 +1456,11 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX11-NEXT:    s_mov_b32 s16, s6
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_getpc_b64 s[28:29]
-; GFX11-NEXT:    s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[26:27]
+; GFX11-NEXT:    s_add_u32 s26, s26, external_void_func_a15i32_inreg@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s27, s27, external_void_func_a15i32_inreg@rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[28:29]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[26:27]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
@@ -1483,34 +1471,22 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
+  call void @external_void_func_a15i32_inreg([13 x i32] inreg %arg0)
   ret void
 }
 
-; FIXME:
-; define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
-;   call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
-;   ret void
-; }
-
-; FIXME:
-; define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
-;   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
-;   ret void
-; }
-
 
 ; FIXME: This should also fail
-define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #1 {
+define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
 ; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s23, s33
+; GFX9-NEXT:    s_mov_b32 s21, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_or_saveexec_b64 s[24:25], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[24:25]
-; GFX9-NEXT:    v_writelane_b32 v40, s23, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s3, s7
@@ -1527,13 +1503,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; GFX9-NEXT:    s_mov_b32 s11, s18
 ; GFX9-NEXT:    s_mov_b32 s15, s19
 ; GFX9-NEXT:    s_mov_b32 s16, s20
-; GFX9-NEXT:    s_mov_b32 s17, s21
-; GFX9-NEXT:    s_mov_b32 s18, s22
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[24:25]
-; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1548,19 +1522,19 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s19, s33
+; GFX11-NEXT:    s_mov_b32 s17, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s20, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s18, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s20
-; GFX11-NEXT:    v_writelane_b32 v40, s19, 2
+; GFX11-NEXT:    s_mov_b32 exec_lo, s18
+; GFX11-NEXT:    v_writelane_b32 v40, s17, 2
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[20:21]
-; GFX11-NEXT:    s_add_u32 s20, s20, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s21, s21, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[18:19]
+; GFX11-NEXT:    s_add_u32 s18, s18, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s19, s19, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -1572,7 +1546,7 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  call void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([15 x i32] inreg %arg0, i32 inreg %arg1)
+  call void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([13 x i32] inreg %arg0, i32 inreg %arg1)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index a7522ef..c63e9d4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -567,7 +567,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v1
 ; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
@@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_sub_nc_u16 v1, v1, 24
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffe8
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
@@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
@@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_sub_nc_u16 v1, v1, 25
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffe7
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index d94ec56..147ddc4 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1313,7 +1313,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_sub_u32_e32 v0, 64, v8
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[0:1], v0, v[10:11]
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], v8, v[12:13]
-; GFX9-G-NEXT:    v_subrev_u32_e32 v9, 64, v8
+; GFX9-G-NEXT:    v_add_u32_e32 v9, 0xffffffc0, v8
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[6:7], v8, v[10:11]
 ; GFX9-G-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-G-NEXT:    v_or_b32_e32 v3, v1, v3
@@ -1338,7 +1338,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_sub_u32_e32 v2, 64, v20
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[0:1], v20, v[10:11]
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], v2, v[12:13]
-; GFX9-G-NEXT:    v_subrev_u32_e32 v24, 64, v20
+; GFX9-G-NEXT:    v_add_u32_e32 v24, 0xffffffc0, v20
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[14:15], v20, v[12:13]
 ; GFX9-G-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-G-NEXT:    v_or_b32_e32 v3, v1, v3
@@ -2070,8 +2070,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v4
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v4, v18, v4
+; GFX9-G-O0-NEXT:    s_mov_b32 s5, 0xffffffc0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v4, v18, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v18
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
@@ -2203,8 +2204,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v3
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v2, v8, v0
+; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0xffffffc0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v2, v8, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v14, v0, v8
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
@@ -3453,7 +3455,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_sub_u32_e32 v8, 64, v16
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[10:11], v16, v[2:3]
-; GFX9-G-NEXT:    v_subrev_u32_e32 v14, 64, v16
+; GFX9-G-NEXT:    v_add_u32_e32 v14, 0xffffffc0, v16
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[12:13], v16, v[0:1]
 ; GFX9-G-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GFX9-G-NEXT:    v_or_b32_e32 v11, v9, v11
@@ -3476,7 +3478,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    s_cbranch_execz .LBB1_5
 ; GFX9-G-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GFX9-G-NEXT:    v_sub_u32_e32 v12, 64, v18
-; GFX9-G-NEXT:    v_subrev_u32_e32 v22, 64, v18
+; GFX9-G-NEXT:    v_add_u32_e32 v22, 0xffffffc0, v18
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[10:11], v18, v[0:1]
 ; GFX9-G-NEXT:    v_lshlrev_b64 v[12:13], v12, v[2:3]
 ; GFX9-G-NEXT:    v_lshrrev_b64 v[16:17], v18, v[2:3]
@@ -4175,8 +4177,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v7
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v4, v12, v4
+; GFX9-G-O0-NEXT:    s_mov_b32 s5, 0xffffffc0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v4, v12, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v12
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
@@ -4311,8 +4314,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, 64
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v2, v3, v0
+; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0xffffffc0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v2, v3, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v8, v0, v3
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index e04cd71..691f3d3 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -476,18 +476,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v0
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v2
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v8, v8, v2
-; GISEL-NEXT:    v_or_b32_e32 v9, v1, v3
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT:    v_or_b32_e32 v9, v3, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -505,12 +505,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_6
 ; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
-; GISEL-NEXT:    v_add_i32_e32 v28, vcc, 1, v0
-; GISEL-NEXT:    v_addc_u32_e64 v29, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v32, vcc, 0x7f, v0
-; GISEL-NEXT:    v_addc_u32_e64 v30, vcc, 0, v2, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v32
+; GISEL-NEXT:    v_add_i32_e32 v28, vcc, 1, v2
+; GISEL-NEXT:    v_addc_u32_e64 v29, s[4:5], 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v32, vcc, 0x7f, v2
+; GISEL-NEXT:    v_not_b32_e32 v2, 63
+; GISEL-NEXT:    v_addc_u32_e64 v30, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v20, s[4:5], v32, v2
 ; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], 64, v32
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], v[16:17], v32
 ; GISEL-NEXT:    v_lshl_b64 v[2:3], v[18:19], v32
@@ -536,7 +537,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_5
 ; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
-; GISEL-NEXT:    v_subrev_i32_e32 v34, vcc, 64, v28
+; GISEL-NEXT:    v_add_i32_e32 v34, vcc, 0xffffffc0, v28
 ; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v28
 ; GISEL-NEXT:    v_lshr_b64 v[0:1], v[18:19], v28
 ; GISEL-NEXT:    v_lshr_b64 v[2:3], v[16:17], v28
@@ -665,18 +666,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, 0x7f, v0
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_xor_b32_e32 v10, 0x7f, v2
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v10, v10, v2
-; GISEL-NEXT:    v_or_b32_e32 v11, v1, v3
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT:    v_or_b32_e32 v11, v3, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
@@ -694,12 +695,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_12
 ; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
-; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v0
-; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v0
-; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v2, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v14, s[4:5], 64, v30
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v2
+; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v2
+; GISEL-NEXT:    v_not_b32_e32 v2, 63
+; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v30, v2
 ; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 64, v30
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], v[6:7], v30
 ; GISEL-NEXT:    v_lshl_b64 v[2:3], v[12:13], v30
@@ -725,7 +727,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_11
 ; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
-; GISEL-NEXT:    v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT:    v_add_i32_e32 v32, vcc, 0xffffffc0, v26
 ; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, 64, v26
 ; GISEL-NEXT:    v_lshr_b64 v[0:1], v[12:13], v26
 ; GISEL-NEXT:    v_lshr_b64 v[2:3], v[6:7], v26
@@ -1229,18 +1231,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v20, vcc, v2, v3
-; GISEL-NEXT:    v_subb_u32_e64 v21, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25]
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, v2, v3
+; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v20
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v22
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v2, v2, v22
-; GISEL-NEXT:    v_or_b32_e32 v3, v21, v23
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v20
+; GISEL-NEXT:    v_or_b32_e32 v3, v23, v21
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -1258,12 +1260,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB1_6
 ; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
-; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v20
-; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v21, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v20
-; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v22, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v23, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v22, s[4:5], 64, v30
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v22
+; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v23, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v22
+; GISEL-NEXT:    v_not_b32_e32 v2, 63
+; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v20, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v21, vcc
+; GISEL-NEXT:    v_add_i32_e64 v22, s[4:5], v30, v2
 ; GISEL-NEXT:    v_sub_i32_e64 v20, s[4:5], 64, v30
 ; GISEL-NEXT:    v_lshl_b64 v[2:3], v[0:1], v30
 ; GISEL-NEXT:    v_lshl_b64 v[18:19], v[16:17], v30
@@ -1289,7 +1292,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB1_5
 ; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
-; GISEL-NEXT:    v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT:    v_add_i32_e32 v32, vcc, 0xffffffc0, v26
 ; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 64, v26
 ; GISEL-NEXT:    v_lshr_b64 v[18:19], v[16:17], v26
 ; GISEL-NEXT:    v_lshr_b64 v[20:21], v[0:1], v26
@@ -1401,18 +1404,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v0
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v16
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v8, v8, v16
-; GISEL-NEXT:    v_or_b32_e32 v9, v1, v17
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT:    v_or_b32_e32 v9, v17, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1430,12 +1433,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB1_12
 ; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v0
-; GISEL-NEXT:    v_addc_u32_e64 v24, vcc, 0, v16, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v17, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v9, s[4:5], 64, v26
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v16
+; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, v17, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v16
+; GISEL-NEXT:    v_not_b32_e32 v9, 63
+; GISEL-NEXT:    v_addc_u32_e64 v24, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v26, v9
 ; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 64, v26
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], v[4:5], v26
 ; GISEL-NEXT:    v_lshl_b64 v[16:17], v[6:7], v26
@@ -1461,7 +1465,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB1_11
 ; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
-; GISEL-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v8
+; GISEL-NEXT:    v_add_i32_e32 v28, vcc, 0xffffffc0, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v8
 ; GISEL-NEXT:    v_lshr_b64 v[16:17], v[6:7], v8
 ; GISEL-NEXT:    v_lshr_b64 v[20:21], v[4:5], v8
@@ -2072,18 +2076,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v0
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v2
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v18, v18, v2
-; GISEL-NEXT:    v_or_b32_e32 v19, v1, v3
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v0
+; GISEL-NEXT:    v_or_b32_e32 v19, v3, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
@@ -2101,12 +2105,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_6
 ; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
-; GISEL-NEXT:    v_add_i32_e32 v31, vcc, 1, v0
-; GISEL-NEXT:    v_addc_u32_e64 v32, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v0
-; GISEL-NEXT:    v_addc_u32_e64 v33, vcc, 0, v2, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v34, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT:    v_add_i32_e32 v31, vcc, 1, v2
+; GISEL-NEXT:    v_addc_u32_e64 v32, s[4:5], 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v2
+; GISEL-NEXT:    v_not_b32_e32 v2, 63
+; GISEL-NEXT:    v_addc_u32_e64 v33, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v34, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v20, s[4:5], v24, v2
 ; GISEL-NEXT:    v_sub_i32_e64 v18, s[4:5], 64, v24
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], v[16:17], v24
 ; GISEL-NEXT:    v_lshl_b64 v[2:3], v[8:9], v24
@@ -2132,7 +2137,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_5
 ; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
-; GISEL-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v31
+; GISEL-NEXT:    v_add_i32_e32 v24, vcc, 0xffffffc0, v31
 ; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v31
 ; GISEL-NEXT:    v_lshr_b64 v[0:1], v[8:9], v31
 ; GISEL-NEXT:    v_lshr_b64 v[2:3], v[16:17], v31
@@ -2262,18 +2267,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v15, v14, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[14:15], v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v0
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v14
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v2, v2, v14
-; GISEL-NEXT:    v_or_b32_e32 v3, v1, v15
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v0
+; GISEL-NEXT:    v_or_b32_e32 v3, v15, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2291,12 +2296,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_12
 ; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
-; GISEL-NEXT:    v_add_i32_e32 v36, vcc, 1, v0
-; GISEL-NEXT:    v_addc_u32_e64 v37, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v0
-; GISEL-NEXT:    v_addc_u32_e64 v38, vcc, 0, v14, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v39, vcc, 0, v15, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT:    v_add_i32_e32 v36, vcc, 1, v14
+; GISEL-NEXT:    v_addc_u32_e64 v37, s[4:5], 0, v15, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v14
+; GISEL-NEXT:    v_not_b32_e32 v2, 63
+; GISEL-NEXT:    v_addc_u32_e64 v38, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v39, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v20, s[4:5], v24, v2
 ; GISEL-NEXT:    v_sub_i32_e64 v14, s[4:5], 64, v24
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], v[12:13], v24
 ; GISEL-NEXT:    v_lshl_b64 v[2:3], v[6:7], v24
@@ -2322,7 +2328,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_11
 ; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
-; GISEL-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v36
+; GISEL-NEXT:    v_add_i32_e32 v24, vcc, 0xffffffc0, v36
 ; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v36
 ; GISEL-NEXT:    v_lshr_b64 v[0:1], v[6:7], v36
 ; GISEL-NEXT:    v_lshr_b64 v[2:3], v[12:13], v36
@@ -2903,18 +2909,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21]
+; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v16, v17
+; GISEL-NEXT:    v_subb_u32_e64 v19, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v20, 0x7f, v16
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_xor_b32_e32 v20, 0x7f, v18
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v20, v20, v18
-; GISEL-NEXT:    v_or_b32_e32 v21, v17, v19
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v16
+; GISEL-NEXT:    v_or_b32_e32 v21, v19, v17
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
@@ -2932,12 +2938,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_6
 ; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
-; GISEL-NEXT:    v_add_i32_e32 v30, vcc, 1, v16
-; GISEL-NEXT:    v_addc_u32_e64 v31, s[4:5], 0, v17, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v16
-; GISEL-NEXT:    v_addc_u32_e64 v32, vcc, 0, v18, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, 0, v19, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v22, s[4:5], 64, v26
+; GISEL-NEXT:    v_add_i32_e32 v30, vcc, 1, v18
+; GISEL-NEXT:    v_addc_u32_e64 v31, s[4:5], 0, v19, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v18
+; GISEL-NEXT:    v_not_b32_e32 v18, 63
+; GISEL-NEXT:    v_addc_u32_e64 v32, vcc, 0, v16, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_add_i32_e64 v22, s[4:5], v26, v18
 ; GISEL-NEXT:    v_sub_i32_e64 v20, s[4:5], 64, v26
 ; GISEL-NEXT:    v_lshl_b64 v[16:17], v[0:1], v26
 ; GISEL-NEXT:    v_lshl_b64 v[18:19], v[2:3], v26
@@ -2963,7 +2970,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_5
 ; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
-; GISEL-NEXT:    v_subrev_i32_e32 v26, vcc, 64, v30
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 0xffffffc0, v30
 ; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 64, v30
 ; GISEL-NEXT:    v_lshr_b64 v[16:17], v[2:3], v30
 ; GISEL-NEXT:    v_lshr_b64 v[18:19], v[0:1], v30
@@ -3075,18 +3082,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25]
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, v16, v17
+; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v16
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v22
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v18, v18, v22
-; GISEL-NEXT:    v_or_b32_e32 v19, v17, v23
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v16
+; GISEL-NEXT:    v_or_b32_e32 v19, v23, v17
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
@@ -3104,12 +3111,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_12
 ; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
-; GISEL-NEXT:    v_add_i32_e32 v34, vcc, 1, v16
-; GISEL-NEXT:    v_addc_u32_e64 v35, s[4:5], 0, v17, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v28, vcc, 0x7f, v16
-; GISEL-NEXT:    v_addc_u32_e64 v36, vcc, 0, v22, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v23, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v24, s[4:5], 64, v28
+; GISEL-NEXT:    v_add_i32_e32 v34, vcc, 1, v22
+; GISEL-NEXT:    v_addc_u32_e64 v35, s[4:5], 0, v23, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v28, vcc, 0x7f, v22
+; GISEL-NEXT:    v_not_b32_e32 v18, 63
+; GISEL-NEXT:    v_addc_u32_e64 v36, vcc, 0, v16, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_add_i32_e64 v24, s[4:5], v28, v18
 ; GISEL-NEXT:    v_sub_i32_e64 v22, s[4:5], 64, v28
 ; GISEL-NEXT:    v_lshl_b64 v[16:17], v[4:5], v28
 ; GISEL-NEXT:    v_lshl_b64 v[18:19], v[6:7], v28
@@ -3135,7 +3143,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_11
 ; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
-; GISEL-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v34
+; GISEL-NEXT:    v_add_i32_e32 v28, vcc, 0xffffffc0, v34
 ; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 64, v34
 ; GISEL-NEXT:    v_lshr_b64 v[16:17], v[6:7], v34
 ; GISEL-NEXT:    v_lshr_b64 v[18:19], v[4:5], v34
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3de8db2..0e5a687 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -163,6 +163,31 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind {
+; SI-LABEL: bzhi32_d0_even:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_d0_even:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %times2 = shl i32 %numlowbits, 1
+  %numhighbits = sub i32 32, %times2
+  %highbitscleared = shl i32 %val, %numhighbits
+  %masked = lshr i32 %highbitscleared, %numhighbits
+  ret i32 %masked
+}
+
 define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI-LABEL: bzhi32_d1_indexzext:
 ; SI:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 05a4c2a..a0fe9d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -234,17 +234,17 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_4
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffbcd, v6
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
-; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffb8d, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
@@ -602,17 +602,17 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB1_4
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffbcd, v6
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
-; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffb8d, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
@@ -963,17 +963,17 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_4
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff6a, v6
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
-; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
@@ -1318,17 +1318,17 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_4
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff6a, v6
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
-; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
@@ -1699,7 +1699,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
 ; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff7a, v5
 ; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[7:8]
-; GISEL-NEXT:    v_subrev_u32_e32 v4, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v4, 0xffffff3a, v5
 ; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GISEL-NEXT:    v_lshl_or_b32 v11, v11, 16, v11
@@ -2048,7 +2048,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
 ; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff7a, v5
 ; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[7:8]
-; GISEL-NEXT:    v_subrev_u32_e32 v4, 64, v6
+; GISEL-NEXT:    v_add_u32_e32 v4, 0xffffff3a, v5
 ; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GISEL-NEXT:    v_lshl_or_b32 v11, v11, 16, v11
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 2999ddb..f372a54 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -193,32 +193,32 @@ define float @sitofp_i128_to_f32(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
 ; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
 ; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
-; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v13, 0xffffffc0, v4
 ; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
 ; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_add_u32_e32 v14, 26, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v14
 ; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v14, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_add_u32_e32 v5, 0xffffffda, v5
 ; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
 ; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
-; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v5, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
 ; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
@@ -438,32 +438,32 @@ define float @uitofp_i128_to_f32(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
 ; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
 ; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v12, 0xffffffc0, v4
 ; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_add_u32_e32 v13, 26, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v13
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v13, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_add_u32_e32 v5, 0xffffffda, v5
 ; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
 ; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
-; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v5, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
 ; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
@@ -723,34 +723,34 @@ define double @sitofp_i128_to_f64(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v14
 ; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v14, v[2:3]
 ; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
-; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT:    v_add_u32_e32 v15, 0xffffffc0, v14
 ; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v14, v[4:5]
 ; GISEL-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GISEL-NEXT:    v_or_b32_e32 v11, v1, v11
 ; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v15, v[4:5]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GISEL-NEXT:    v_add_u32_e32 v9, 55, v9
+; GISEL-NEXT:    v_add_u32_e32 v15, 55, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
-; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v9
+; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, v0, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v1, v3, s[4:5]
-; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v9, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v15, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[12:13], v12, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v9
+; GISEL-NEXT:    v_add_u32_e32 v9, -9, v9
 ; GISEL-NEXT:    v_or_b32_e32 v16, v0, v12
 ; GISEL-NEXT:    v_or_b32_e32 v17, v1, v13
-; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v15, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v9, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v13, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v12, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_and_or_b32 v0, v9, v2, v0
@@ -999,35 +999,35 @@ define double @uitofp_i128_to_f64(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v9, 64, v13
 ; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v13, v[0:1]
 ; GISEL-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
-; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v13
+; GISEL-NEXT:    v_add_u32_e32 v14, 0xffffffc0, v13
 ; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v13, v[2:3]
 ; GISEL-NEXT:    v_or_b32_e32 v9, v4, v9
 ; GISEL-NEXT:    v_or_b32_e32 v10, v5, v10
 ; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[2:3]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
-; GISEL-NEXT:    v_add_u32_e32 v8, 55, v8
+; GISEL-NEXT:    v_add_u32_e32 v15, 55, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
-; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v8
+; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, v4, v0, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v5, v1, s[4:5]
-; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v8, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v15, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[12:13], v12, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v8
+; GISEL-NEXT:    v_add_u32_e32 v8, -9, v8
 ; GISEL-NEXT:    v_or_b32_e32 v16, v4, v12
 ; GISEL-NEXT:    v_or_b32_e32 v17, v5, v13
-; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v15, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v8, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v12, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v13, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v12, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, v4, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, v5, v3
 ; GISEL-NEXT:    v_and_or_b32 v0, v8, v0, v2
@@ -1284,32 +1284,32 @@ define half @sitofp_i128_to_f16(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
 ; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
 ; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
-; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v13, 0xffffffc0, v4
 ; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
 ; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_add_u32_e32 v14, 26, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v14
 ; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v14, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_add_u32_e32 v5, 0xffffffda, v5
 ; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
 ; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
-; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v5, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
 ; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
@@ -1531,32 +1531,32 @@ define half @uitofp_i128_to_f16(i128 %x) {
 ; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
 ; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
 ; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v12, 0xffffffc0, v4
 ; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_add_u32_e32 v13, 26, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v13
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v13, -1
 ; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_add_u32_e32 v5, 0xffffffda, v5
 ; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
 ; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
-; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v5, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, -1, s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
 ; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 8d99ec2e..b2bfc2e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -139,7 +139,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    s_load_b96 s[0:2], s[2:3], 0x24
 ; VARIANT6-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT6-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0
-; VARIANT6-NEXT:    s_sub_co_i32 s2, s2, 1
+; VARIANT6-NEXT:    s_add_co_i32 s2, s2, -1
 ; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; VARIANT6-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4
 ; VARIANT6-NEXT:    v_sub_nc_u32_e32 v0, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index a577fb3..d874418 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -395,7 +395,7 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v2
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -459,7 +459,7 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, v2
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -513,7 +513,7 @@ define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-LABEL: possubnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -575,7 +575,7 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v2
-; GFX7GLISEL-NEXT:    v_subrev_i32_e64 v0, s[4:5], 1, v1
+; GFX7GLISEL-NEXT:    v_add_i32_e64 v0, s[4:5], -1, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
@@ -1587,7 +1587,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
@@ -1647,7 +1647,7 @@ define i1 @isnormal_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -1780,7 +1780,7 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -1853,7 +1853,7 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -1911,7 +1911,7 @@ define i1 @issubnormal_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -1974,7 +1974,7 @@ define i1 @not_issubnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2081,7 +2081,7 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2091,7 +2091,7 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2568,7 +2568,7 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0: ; %entry
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2576,7 +2576,7 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2635,7 +2635,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL:       ; %bb.0: ; %entry
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2643,7 +2643,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2702,7 +2702,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL:       ; %bb.0: ; %entry
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2710,7 +2710,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2902,7 +2902,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0: ; %entry
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2914,7 +2914,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x1ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
@@ -2983,7 +2983,7 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0: ; %entry
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v1, vcc, -1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
@@ -2994,7 +2994,7 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
+; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 6ac04d8..b3c0675 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
@@ -78,44 +78,79 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: v_test_i32_x_sub_64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: v_test_i32_x_sub_64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v1, 64, v1
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_i32_x_sub_64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: v_test_i32_x_sub_64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 0xffffffc0, v1
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_i32_x_sub_64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_i32_x_sub_64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i32_x_sub_64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i32_x_sub_64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i32_x_sub_64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -163,8 +198,8 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
@@ -211,66 +246,119 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
+; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
+; VI-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 0xffffffc0, v4
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v3
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v1, 64, v1
+; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v2, 64, v2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_dword v0, v2, s[4:5]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 0xffffffc0, v1
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffc0, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 0xffffffc0, v2
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v2, 0xffffffc0, v2
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -425,7 +513,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0x41, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
@@ -462,79 +550,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 0x41, v3
+; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-GISEL-NEXT:    s_endpgm
-;
-; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_test_i32_x_sub_65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX10-LABEL: v_test_i32_x_sub_65:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: v_test_i32_x_sub_65:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -686,7 +739,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, -16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
@@ -723,79 +776,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, -16, v3
+; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-GISEL-NEXT:    s_endpgm
-;
-; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_test_i32_x_sub_neg16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 16, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX10-LABEL: v_test_i32_x_sub_neg16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 16, v1
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
-; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: v_test_i32_x_sub_neg16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 16, v1
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -947,7 +965,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0xffffffef, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
@@ -984,79 +1002,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 0xffffffef, v3
+; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-GISEL-NEXT:    s_endpgm
-;
-; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_test_i32_x_sub_neg17:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 17, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX10-LABEL: v_test_i32_x_sub_neg17:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 17, v1
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: v_test_i32_x_sub_neg17:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 17, v1
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -1263,7 +1246,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
@@ -1300,44 +1283,79 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
 ; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: v_test_i16_x_sub_64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: v_test_i16_x_sub_64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
+; GFX9-SDAG-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_i16_x_sub_64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: v_test_i16_x_sub_64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_i16_x_sub_64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_i16_x_sub_64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX10-SDAG-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i16_x_sub_64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i16_x_sub_64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX11-SDAG-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i16_x_sub_64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
@@ -1379,7 +1397,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
@@ -1419,50 +1437,91 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v2
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v2
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v1, s[6:7]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX10-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_u16 v1, v1, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v1, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
@@ -1511,8 +1570,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
@@ -1559,66 +1618,119 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v3, 64, v4
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
+; VI-GISEL-NEXT:    v_add_u16_e32 v3, 0xffc0, v4
 ; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    flat_store_short v[0:1], v3
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_ushort v2, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
+; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v2
+; GFX9-SDAG-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT:    v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_short v0, v2, s[4:5]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v2
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_i16_x_sub_64_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT:    v_sub_nc_u16 v2, v2, 64
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX10-SDAG-NEXT:    v_sub_nc_u16 v2, v2, 64
+; GFX10-SDAG-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, 0xffc0
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
+; GFX11-SDAG-NEXT:    v_sub_nc_u16 v2, v2, 64
+; GFX11-SDAG-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, 0xffc0
+; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
@@ -1664,8 +1776,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -1698,7 +1810,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 64
+; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -1710,8 +1822,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
@@ -1792,8 +1904,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -1826,7 +1938,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 64
+; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -1838,8 +1950,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 7, v3
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, -7, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
@@ -1933,8 +2045,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7b, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffff85, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -1967,7 +2079,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7b
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff85
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -1979,8 +2091,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
@@ -2074,7 +2186,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -2117,7 +2229,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; VI-GISEL-NEXT:    v_subrev_u16_e32 v3, 7, v3
+; VI-GISEL-NEXT:    v_add_u16_e32 v3, -7, v3
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
@@ -2197,7 +2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -2237,11 +2349,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 16
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, -16
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
@@ -2319,7 +2431,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0xffffc400, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x3c00, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -2359,11 +2471,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffc400
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
@@ -2454,7 +2566,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x4400, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffbc00, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -2494,11 +2606,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index c503d65..14ff9e0 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -457,19 +457,19 @@ entry:
 define i64 @subi_i64(i64 %a) {
 ; RV32IM-LABEL: subi_i64:
 ; RV32IM:       # %bb.0: # %entry
-; RV32IM-NEXT:    lui a2, 301
-; RV32IM-NEXT:    addi a3, a2, 1548
-; RV32IM-NEXT:    sub a2, a0, a3
-; RV32IM-NEXT:    sltu a0, a0, a3
-; RV32IM-NEXT:    sub a1, a1, a0
-; RV32IM-NEXT:    mv a0, a2
+; RV32IM-NEXT:    lui a2, 1048275
+; RV32IM-NEXT:    addi a2, a2, -1548
+; RV32IM-NEXT:    add a0, a0, a2
+; RV32IM-NEXT:    sltu a2, a0, a2
+; RV32IM-NEXT:    addi a1, a1, -1
+; RV32IM-NEXT:    add a1, a1, a2
 ; RV32IM-NEXT:    ret
 ;
 ; RV64IM-LABEL: subi_i64:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    lui a1, 301
-; RV64IM-NEXT:    addiw a1, a1, 1548
-; RV64IM-NEXT:    sub a0, a0, a1
+; RV64IM-NEXT:    lui a1, 1048275
+; RV64IM-NEXT:    addiw a1, a1, -1548
+; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i64 %a, 1234444
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir
index 2ef5de5..39d0ee7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir
@@ -200,8 +200,9 @@ body:             |
     ; RV32I: liveins: $x10
     ; RV32I-NEXT: {{  $}}
     ; RV32I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
-    ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234
-    ; RV32I-NEXT: $x10 = COPY [[ADDI]]
+    ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234
+    ; RV32I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]]
+    ; RV32I-NEXT: $x10 = COPY [[SUB]]
     ; RV32I-NEXT: PseudoRET implicit $x10
     %0:gprb(s32) = COPY $x10
     %1:gprb(s32) = G_CONSTANT i32 -1234
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir
index be12333..527036d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir
@@ -188,8 +188,9 @@ body:             |
     ; RV64I: liveins: $x10
     ; RV64I-NEXT: {{  $}}
     ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
-    ; RV64I-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1234
-    ; RV64I-NEXT: $x10 = COPY [[ADDIW]]
+    ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234
+    ; RV64I-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[ADDI]]
+    ; RV64I-NEXT: $x10 = COPY [[SUBW]]
     ; RV64I-NEXT: PseudoRET implicit $x10
     %0:gprb(s64) = COPY $x10
     %1:gprb(s32) = G_TRUNC %0(s64)
@@ -440,8 +441,9 @@ body:             |
     ; RV64I: liveins: $x10
     ; RV64I-NEXT: {{  $}}
     ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
-    ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234
-    ; RV64I-NEXT: $x10 = COPY [[ADDI]]
+    ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234
+    ; RV64I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]]
+    ; RV64I-NEXT: $x10 = COPY [[SUB]]
     ; RV64I-NEXT: PseudoRET implicit $x10
     %0:gprb(s64) = COPY $x10
     %1:gprb(s64) = G_CONSTANT i64 -1234
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir
index f748f08..605830f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir
@@ -47,39 +47,9 @@ body:             |
     ; RV64I-NEXT: $x10 = COPY [[DIVW]]
     ; RV64I-NEXT: PseudoRET implicit $x10
     %0:gprb(s64) = COPY $x10
-    %1:gprb(s32) = G_TRUNC %0(s64)
-    %2:gprb(s64) = COPY $x11
-    %3:gprb(s32) = G_TRUNC %2(s64)
-    %4:gprb(s32) = G_SDIV %1, %3
-    %5:gprb(s64) = G_ANYEXT %4(s32)
-    $x10 = COPY %5(s64)
-    PseudoRET implicit $x10
-
-...
----
-name:            srem_i32
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-body:             |
-  bb.0.entry:
-    liveins: $x10, $x11
-
-    ; RV64I-LABEL: name: srem_i32
-    ; RV64I: liveins: $x10, $x11
-    ; RV64I-NEXT: {{  $}}
-    ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
-    ; RV64I-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
-    ; RV64I-NEXT: [[REMW:%[0-9]+]]:gpr = REMW [[COPY]], [[COPY1]]
-    ; RV64I-NEXT: $x10 = COPY [[REMW]]
-    ; RV64I-NEXT: PseudoRET implicit $x10
-    %0:gprb(s64) = COPY $x10
-    %1:gprb(s32) = G_TRUNC %0(s64)
-    %2:gprb(s64) = COPY $x11
-    %3:gprb(s32) = G_TRUNC %2(s64)
-    %4:gprb(s32) = G_SREM %1, %3
-    %5:gprb(s64) = G_ANYEXT %4(s32)
-    $x10 = COPY %5(s64)
+    %1:gprb(s64) = COPY $x11
+    %2:gprb(s64) = G_DIVW %0, %1
+    $x10 = COPY %2(s64)
     PseudoRET implicit $x10
 
 ...
@@ -101,12 +71,9 @@ body:             |
     ; RV64I-NEXT: $x10 = COPY [[DIVUW]]
     ; RV64I-NEXT: PseudoRET implicit $x10
     %0:gprb(s64) = COPY $x10
-    %1:gprb(s32) = G_TRUNC %0(s64)
-    %2:gprb(s64) = COPY $x11
-    %3:gprb(s32) = G_TRUNC %2(s64)
-    %4:gprb(s32) = G_UDIV %1, %3
-    %5:gprb(s64) = G_ANYEXT %4(s32)
-    $x10 = COPY %5(s64)
+    %1:gprb(s64) = COPY $x11
+    %2:gprb(s64) = G_DIVUW %0, %1
+    $x10 = COPY %2(s64)
     PseudoRET implicit $x10
 
 ...
@@ -128,12 +95,9 @@ body:             |
     ; RV64I-NEXT: $x10 = COPY [[REMUW]]
     ; RV64I-NEXT: PseudoRET implicit $x10
     %0:gprb(s64) = COPY $x10
-    %1:gprb(s32) = G_TRUNC %0(s64)
-    %2:gprb(s64) = COPY $x11
-    %3:gprb(s32) = G_TRUNC %2(s64)
-    %4:gprb(s32) = G_UREM %1, %3
-    %5:gprb(s64) = G_ANYEXT %4(s32)
-    $x10 = COPY %5(s64)
+    %1:gprb(s64) = COPY $x11
+    %2:gprb(s64) = G_REMUW %0, %1
+    $x10 = COPY %2(s64)
     PseudoRET implicit $x10
 
 ...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir
index 5d980e7..d023789 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir
@@ -111,8 +111,8 @@ body:             |
     %2:gprb(s64) = G_ASSERT_SEXT %1, 32
     %7:gprb(s64) = G_CONSTANT i64 5
     %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 1
-    %5:gprb(s64) = G_SUB %3, %4
+    %4:gprb(s64) = G_CONSTANT i64 -1
+    %5:gprb(s64) = G_ADD %3, %4
     %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
     G_BRCOND %26(s64), %bb.8
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir
index 27fe465..396421a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir
@@ -115,8 +115,8 @@ body:             |
     %12:gprb(s32) = G_CONSTANT i32 3
     %13:gprb(s32) = G_CONSTANT i32 4
     %14:gprb(s32) = G_CONSTANT i32 1000
-    %1:gprb(s32) = G_CONSTANT i32 1
-    %2:gprb(s32) = G_SUB %0, %1
+    %1:gprb(s32) = G_CONSTANT i32 -1
+    %2:gprb(s32) = G_ADD %0, %1
     %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4
     G_BRCOND %16(s32), %bb.8
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir
index 77156b9..0a08586 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir
@@ -112,8 +112,8 @@ body:             |
     %2:gprb(s64) = G_ASSERT_SEXT %1, 32
     %7:gprb(s64) = G_CONSTANT i64 5
     %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 1
-    %5:gprb(s64) = G_SUB %3, %4
+    %4:gprb(s64) = G_CONSTANT i64 -1
+    %5:gprb(s64) = G_ADD %3, %4
     %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
     G_BRCOND %26(s64), %bb.8
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir
index 388c238..efa1a6c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir
@@ -171,8 +171,8 @@ body:             |
     %12:gprb(s32) = G_CONSTANT i32 3
     %13:gprb(s32) = G_CONSTANT i32 4
     %14:gprb(s32) = G_CONSTANT i32 1000
-    %1:gprb(s32) = G_CONSTANT i32 1
-    %2:gprb(s32) = G_SUB %0, %1
+    %1:gprb(s32) = G_CONSTANT i32 -1
+    %2:gprb(s32) = G_ADD %0, %1
     %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4
     G_BRCOND %16(s32), %bb.8
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir
index 09a8551..12b1517 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir
@@ -112,8 +112,8 @@ body:             |
     %2:gprb(s64) = G_ASSERT_SEXT %1, 32
     %7:gprb(s64) = G_CONSTANT i64 5
     %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 1
-    %5:gprb(s64) = G_SUB %3, %4
+    %4:gprb(s64) = G_CONSTANT i64 -1
+    %5:gprb(s64) = G_ADD %3, %4
     %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
     G_BRCOND %26(s64), %bb.8
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
index 183f5e5..657dd3c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
@@ -27,16 +27,13 @@ body:             |
     ; CHECK-M-LABEL: name: sdiv_i8
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -72,16 +69,13 @@ body:             |
     ; CHECK-M-LABEL: name: sdiv_i15
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 49
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -117,16 +111,13 @@ body:             |
     ; CHECK-M-LABEL: name: sdiv_i16
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -159,11 +150,8 @@ body:             |
     ; CHECK-M-LABEL: name: sdiv_i32
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[TRUNC]], [[TRUNC1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[COPY]], [[COPY1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -343,14 +331,11 @@ body:             |
     ; CHECK-M-LABEL: name: udiv_i8
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -384,14 +369,11 @@ body:             |
     ; CHECK-M-LABEL: name: udiv_i15
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -425,14 +407,11 @@ body:             |
     ; CHECK-M-LABEL: name: udiv_i16
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -466,11 +445,8 @@ body:             |
     ; CHECK-M-LABEL: name: udiv_i32
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[TRUNC]], [[TRUNC1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[COPY]], [[COPY1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir
index cd95168..8239bb6 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir
@@ -27,16 +27,13 @@ body:             |
     ; CHECK-M-LABEL: name: srem_i8
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -72,16 +69,13 @@ body:             |
     ; CHECK-M-LABEL: name: srem_i15
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 49
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -117,16 +111,13 @@ body:             |
     ; CHECK-M-LABEL: name: srem_i16
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32)
-    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+    ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64)
+    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -159,11 +150,10 @@ body:             |
     ; CHECK-M-LABEL: name: srem_i32
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[TRUNC]], [[TRUNC1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-M-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -343,14 +333,11 @@ body:             |
     ; CHECK-M-LABEL: name: urem_i8
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -384,14 +371,11 @@ body:             |
     ; CHECK-M-LABEL: name: urem_i15
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -425,14 +409,11 @@ body:             |
     ; CHECK-M-LABEL: name: urem_i16
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]]
-    ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535
+    ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
@@ -466,11 +447,8 @@ body:             |
     ; CHECK-M-LABEL: name: urem_i32
     ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
-    ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[TRUNC]], [[TRUNC1]]
-    ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32)
-    ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[COPY]], [[COPY1]]
+    ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64)
     ; CHECK-M-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
index 1b323fe..02f81e1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
@@ -143,8 +143,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    bltu a6, a4, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 0
-; CHECK-NEXT:    addi a5, a6, -32
-; CHECK-NEXT:    sll a7, a0, a5
+; CHECK-NEXT:    sll a7, a0, a6
 ; CHECK-NEXT:    j .LBB7_3
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    sll a3, a0, a2
@@ -162,8 +161,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    andi a6, a5, 63
 ; CHECK-NEXT:    bltu a6, a4, .LBB7_7
 ; CHECK-NEXT:  # %bb.6:
-; CHECK-NEXT:    addi a7, a6, -32
-; CHECK-NEXT:    srl a7, a1, a7
+; CHECK-NEXT:    srl a7, a1, a6
 ; CHECK-NEXT:    bnez a6, .LBB7_8
 ; CHECK-NEXT:    j .LBB7_9
 ; CHECK-NEXT:  .LBB7_7:
@@ -220,8 +218,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    bltu a5, a4, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addi a3, a5, -32
-; CHECK-NEXT:    srl a6, a1, a3
+; CHECK-NEXT:    srl a6, a1, a5
 ; CHECK-NEXT:    mv a3, a0
 ; CHECK-NEXT:    bnez a5, .LBB9_3
 ; CHECK-NEXT:    j .LBB9_4
@@ -255,8 +252,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    bltu a5, a4, .LBB9_6
 ; CHECK-NEXT:  .LBB9_8:
 ; CHECK-NEXT:    li a4, 0
-; CHECK-NEXT:    addi a6, a5, -32
-; CHECK-NEXT:    sll a0, a0, a6
+; CHECK-NEXT:    sll a0, a0, a5
 ; CHECK-NEXT:    beqz a5, .LBB9_10
 ; CHECK-NEXT:  .LBB9_9:
 ; CHECK-NEXT:    mv a1, a0
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index e9e5bec..efb7ba3 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -365,8 +365,8 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX2-LABEL: ext_i32_32i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
@@ -635,12 +635,12 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX2-LABEL: ext_i64_64i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 9f6de9a..7863de8 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -456,8 +456,8 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX2-LABEL: ext_i32_32i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
@@ -806,14 +806,14 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX2-LABEL: ext_i64_64i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index c27b5b2..33a0946 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -256,8 +256,8 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
 ; AVX2-LABEL: bitcast_i32_32i1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll
index d48aced..a1ce577 100644
--- a/llvm/test/CodeGen/X86/pr62014.ll
+++ b/llvm/test/CodeGen/X86/pr62014.ll
@@ -50,7 +50,7 @@ define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -115,7 +115,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -180,7 +180,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8
 ; AVX512F-LABEL: select_cast_cond_multiuse_v8i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vmovdqa %xmm2, (%rsi)
@@ -193,7 +193,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8
 ; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm2, %xmm2
-; AVX512VL-NEXT:    vpternlogq $226, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -247,7 +247,7 @@ define <16 x i8> @select_cast_cond_multiuse_v16i8(<16 x i8> %x, <16 x i8> %y, i1
 ; AVX512F-LABEL: select_cast_cond_multiuse_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vmovdqa %xmm2, (%rsi)
@@ -257,9 +257,9 @@ define <16 x i8> @select_cast_cond_multiuse_v16i8(<16 x i8> %x, <16 x i8> %y, i1
 ; AVX512VL-LABEL: select_cast_cond_multiuse_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512VL-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512VL-NEXT:    vpternlogq $226, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -340,7 +340,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
 ; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm2, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 7673f09..8620651 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -2596,9 +2596,8 @@ define <16 x i8> @load_sext_16i1_to_16i8(ptr%ptr) nounwind readnone {
 ;
 ; AVX2-LABEL: load_sext_16i1_to_16i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
@@ -2804,9 +2803,8 @@ define <32 x i8> @load_sext_32i1_to_32i8(ptr%ptr) nounwind readnone {
 ;
 ; AVX2-LABEL: load_sext_32i1_to_32i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastd (%rdi), %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/MC/AArch64/SVE2/aesd.s b/llvm/test/MC/AArch64/SVE2/aesd.s
index f0cbc39..44eb9b6 100644
--- a/llvm/test/MC/AArch64/SVE2/aesd.s
+++ b/llvm/test/MC/AArch64/SVE2/aesd.s
@@ -1,17 +1,17 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 aesd z0.b, z0.b, z31.b
 // CHECK-INST: aesd z0.b, z0.b, z31.b
 // CHECK-ENCODING: [0xe0,0xe7,0x22,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4522e7e0 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2/aese.s b/llvm/test/MC/AArch64/SVE2/aese.s
index 91af386..e64f213 100644
--- a/llvm/test/MC/AArch64/SVE2/aese.s
+++ b/llvm/test/MC/AArch64/SVE2/aese.s
@@ -1,17 +1,17 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 aese z0.b, z0.b, z31.b
 // CHECK-INST: aese z0.b, z0.b, z31.b
 // CHECK-ENCODING: [0xe0,0xe3,0x22,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4522e3e0 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2/aesimc.s b/llvm/test/MC/AArch64/SVE2/aesimc.s
index 8d108d4..c868ed0 100644
--- a/llvm/test/MC/AArch64/SVE2/aesimc.s
+++ b/llvm/test/MC/AArch64/SVE2/aesimc.s
@@ -1,23 +1,23 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 aesimc z0.b, z0.b
 // CHECK-INST: aesimc z0.b, z0.b
 // CHECK-ENCODING: [0x00,0xe4,0x20,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4520e400 <unknown>
 
 aesimc z31.b, z31.b
 // CHECK-INST: aesimc z31.b, z31.b
 // CHECK-ENCODING: [0x1f,0xe4,0x20,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4520e41f <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2/aesmc.s b/llvm/test/MC/AArch64/SVE2/aesmc.s
index d3d8ba1..e158d2b 100644
--- a/llvm/test/MC/AArch64/SVE2/aesmc.s
+++ b/llvm/test/MC/AArch64/SVE2/aesmc.s
@@ -1,23 +1,23 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 aesmc z0.b, z0.b
 // CHECK-INST: aesmc z0.b, z0.b
 // CHECK-ENCODING: [0x00,0xe0,0x20,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4520e000 <unknown>
 
 aesmc z31.b, z31.b
 // CHECK-INST: aesmc z31.b, z31.b
 // CHECK-ENCODING: [0x1f,0xe0,0x20,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 4520e01f <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s
index fd07054..966bead 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s
@@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b
 // CHECK: error: instruction requires: sve2 or sme
 // CHECK-NEXT: tbx z0.b, z1.b, z2.b
 
-.arch armv9-a+sve-aes
-.arch armv9-a+nosve-aes
+.arch armv9-a+sve2-aes
+.arch armv9-a+nosve2-aes
 aesd z23.b, z23.b, z13.b
-// CHECK: error: instruction requires: sve-aes
+// CHECK: error: instruction requires: sve2-aes
 // CHECK-NEXT: aesd z23.b, z23.b, z13.b
 
 .arch armv9-a+sve2-sm4
diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch.s b/llvm/test/MC/AArch64/SVE2/directive-arch.s
index 529b40f..99f6198 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-arch.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-arch.s
@@ -4,7 +4,7 @@
 tbx z0.b, z1.b, z2.b
 // CHECK: tbx z0.b, z1.b, z2.b
 
-.arch armv9-a+sve-aes
+.arch armv9-a+sve2-aes
 aesd z23.b, z23.b, z13.b
 // CHECK: aesd z23.b, z23.b, z13.b
 
diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s
index 594608d..e967f5a 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s
@@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b
 // CHECK: error: instruction requires: sve2 or sme
 // CHECK-NEXT: tbx z0.b, z1.b, z2.b
 
-.arch_extension sve-aes
-.arch_extension nosve-aes
+.arch_extension sve2-aes
+.arch_extension nosve2-aes
 aesd z23.b, z23.b, z13.b
-// CHECK: error: instruction requires: sve2 sve-aes
+// CHECK: error: instruction requires: sve2-aes
 // CHECK-NEXT: aesd z23.b, z23.b, z13.b
 
 .arch_extension sve2-sm4
diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s
index 25dbfdd..2fdbb52 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s
@@ -4,7 +4,7 @@
 tbx z0.b, z1.b, z2.b
 // CHECK: tbx z0.b, z1.b, z2.b
 
-.arch_extension sve-aes
+.arch_extension sve2-aes
 aesd z23.b, z23.b, z13.b
 // CHECK: aesd z23.b, z23.b, z13.b
 
diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s
index aec0596..9a8af63 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s
@@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b
 // CHECK: error: instruction requires: sve2 or sme
 // CHECK-NEXT: tbx z0.b, z1.b, z2.b
 
-.cpu generic+sve2+sve-aes
-.cpu generic+nosve-aes
+.cpu generic+sve2-aes
+.cpu generic+nosve2-aes
 aesd z23.b, z23.b, z13.b
-// CHECK: error: instruction requires: sve2 sve-aes
+// CHECK: error: instruction requires: sve2-aes
 // CHECK-NEXT: aesd z23.b, z23.b, z13.b
 
 .cpu generic+sve2-sm4
diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu.s b/llvm/test/MC/AArch64/SVE2/directive-cpu.s
index a98b8b2..daa5ec5 100644
--- a/llvm/test/MC/AArch64/SVE2/directive-cpu.s
+++ b/llvm/test/MC/AArch64/SVE2/directive-cpu.s
@@ -4,7 +4,7 @@
 tbx z0.b, z1.b, z2.b
 // CHECK: tbx z0.b, z1.b, z2.b
 
-.cpu generic+sve2+sve-aes
+.cpu generic+sve2-aes
 aesd z23.b, z23.b, z13.b
 // CHECK: aesd z23.b, z23.b, z13.b
 
diff --git a/llvm/test/MC/AArch64/SVE2/pmullb-128.s b/llvm/test/MC/AArch64/SVE2/pmullb-128.s
index 0d56243..d48c75b 100644
--- a/llvm/test/MC/AArch64/SVE2/pmullb-128.s
+++ b/llvm/test/MC/AArch64/SVE2/pmullb-128.s
@@ -1,17 +1,17 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 pmullb z29.q, z30.d, z31.d
 // CHECK-INST: pmullb z29.q, z30.d, z31.d
 // CHECK-ENCODING: [0xdd,0x6b,0x1f,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 451f6bdd <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2/pmullt-128.s b/llvm/test/MC/AArch64/SVE2/pmullt-128.s
index 75b6508..e1eca8d 100644
--- a/llvm/test/MC/AArch64/SVE2/pmullt-128.s
+++ b/llvm/test/MC/AArch64/SVE2/pmullt-128.s
@@ -1,17 +1,17 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
-// RUN:        | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \
 // RUN:   | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 
 pmullt z29.q, z30.d, z31.d
 // CHECK-INST: pmullt z29.q, z30.d, z31.d
 // CHECK-ENCODING: [0xdd,0x6f,0x1f,0x45]
-// CHECK-ERROR: instruction requires: sve2 sve-aes
+// CHECK-ERROR: instruction requires: sve2-aes
 // CHECK-UNKNOWN: 451f6fdd <unknown>
diff --git a/llvm/test/MC/Disassembler/RISCV/rv32-invalid-shift.txt b/llvm/test/MC/Disassembler/RISCV/rv32-invalid-shift.txt
new file mode 100644
index 0000000..f565301
--- /dev/null
+++ b/llvm/test/MC/Disassembler/RISCV/rv32-invalid-shift.txt
@@ -0,0 +1,10 @@
+# RUN: not llvm-mc -disassemble -triple=riscv32 -mattr=+c < %s 2>&1 | FileCheck %s --check-prefix=RV32
+# RUN: llvm-mc -disassemble -triple=riscv64 -mattr=+c < %s 2>&1 | FileCheck %s --check-prefix=RV64
+
+[0x13,0x9b,0xdb,0x02]
+# RV32: warning: invalid instruction encoding
+# RV64: slli    s6, s7, 45
+
+[0xfd,0x92]
+# RV32: warning: invalid instruction encoding
+# RV64: srli    a3, a3, 63
diff --git a/llvm/test/Transforms/LoopUnroll/preserve-branch-debuglocs.ll b/llvm/test/Transforms/LoopUnroll/preserve-branch-debuglocs.ll
new file mode 100644
index 0000000..623fb68
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/preserve-branch-debuglocs.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt < %s -passes=loop-unroll -S | FileCheck %s
+; RUN: opt < %s -passes=loop-unroll-full -S | FileCheck %s
+
+;; Loop Unrolling should preserve the DebugLocs of conditional branches that get
+;; optimized into unconditional branches.
+
+define i32 @_ZeqRK4int3S1_() {
+; CHECK-LABEL: define i32 @_ZeqRK4int3S1_() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND:.*]]:
+; CHECK-NEXT:    br label %[[CLEANUP:.*]], !dbg [[DBG3:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    br i1 false, label %[[FOR_COND]], label %[[CLEANUP]]
+; CHECK:       [[CLEANUP]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.cond:                                         ; preds = %for.body
+  br i1 false, label %cleanup, label %for.body, !dbg !3
+
+for.body:                                         ; preds = %for.cond, %entry
+  br i1 false, label %for.cond, label %cleanup
+
+cleanup:                                          ; preds = %for.body, %for.cond
+  ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 20.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "test.cpp", directory: "/tmp")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DILocation(line: 304, column: 2, scope: !4)
+!4 = distinct !DILexicalBlock(scope: !5, file: !1, line: 304, column: 2)
+!5 = distinct !DISubprogram(name: "operator==", linkageName: "_ZeqRK4int3S1_", scope: !1, file: !1, line: 302, type: !6, scopeLine: 303, spFlags: DISPFlagDefinition, unit: !0)
+!6 = distinct !DISubroutineType(types: !7)
+!7 = !{}
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
+; CHECK: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CHECK: [[DBG3]] = !DILocation(line: 304, column: 2, scope: [[META4:![0-9]+]])
+; CHECK: [[META4]] = distinct !DILexicalBlock(scope: [[META5:![0-9]+]], file: [[META1]], line: 304, column: 2)
+; CHECK: [[META5]] = distinct !DISubprogram(name: "operator==", linkageName: "_ZeqRK4int3S1_", scope: [[META1]], file: [[META1]], line: 302, type: [[META6:![0-9]+]], scopeLine: 303, spFlags: DISPFlagDefinition, unit: [[META0]])
+; CHECK: [[META6]] = distinct !DISubroutineType(types: [[META7:![0-9]+]])
+; CHECK: [[META7]] = !{}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll
new file mode 100644
index 0000000..8df8e07
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+; Test for https://github.com/llvm/llvm-project/issues/114860.
+define void @test_invariant_cond_for_select(ptr %dst, i8 %x) #0 {
+; CHECK-LABEL: define void @test_invariant_cond_for_select(
+; CHECK-SAME: ptr [[DST:%.*]], i8 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    [[C_2:%.*]] = icmp sgt i64 [[IV]], 0
+; CHECK-NEXT:    [[C_2_EXT:%.*]] = zext i1 [[C_2]] to i64
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C_1]], i64 [[C_2_EXT]], i64 0
+; CHECK-NEXT:    [[SEL_TRUNC:%.*]] = trunc i64 [[SEL]] to i8
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i8 [[SEL_TRUNC]], ptr [[GEP]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 4
+; CHECK-NEXT:    [[EC:%.*]] = icmp ult i64 [[IV]], 14
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %c.1 = icmp eq i8 %x, 0
+  %c.2 = icmp sgt i64 %iv, 0
+  %c.2.ext = zext i1 %c.2 to i64
+  %sel = select i1 %c.1, i64 %c.2.ext, i64 0
+  %sel.trunc = trunc i64 %sel to i8
+  %gep = getelementptr inbounds i8, ptr %dst, i64 %iv
+  store i8 %sel.trunc, ptr %gep, align 1
+  %iv.next = add i64 %iv, 4
+  %ec = icmp ult i64 %iv, 14
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+64bit,+v" }
+
diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
new file mode 100644
index 0000000..791cab0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s
+
+define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) {
+; CHECK-LABEL: @uitofp_preserve_nneg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX1]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = uitofp nneg <4 x i32> [[VEC_IND]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[INDEX:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[RESULT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[FOR_BODY_PREHEADER4]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER4]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CONV:%.*]] = uitofp nneg i32 [[TMP4]] to float
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[CONV]], [[Y]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = zext nneg i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[RESULT]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP5]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = uitofp nneg i32 %iv to float
+  %val = fmul float %conv, %y
+  %idxprom = zext nneg i32 %iv to i64
+  %arrayidx = getelementptr inbounds float, ptr %result, i64 %idxprom
+  store float %val, ptr %arrayidx, align 4
+  %inc = add nuw nsw i32 %iv, 1
+  %cmp = icmp slt i32 %inc, 256
+  br i1 %cmp, label %for.body, label %for.exit
+
+for.exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/Reassociate/preserve-debugloc.ll b/llvm/test/Transforms/Reassociate/preserve-debugloc.ll
new file mode 100644
index 0000000..ff1f8ac
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/preserve-debugloc.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;; Tests that we preserve DebugLocs through reassociation of sub instructions.
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s
+
+define void @foo(i64 %0) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sub i64 0, [[TMP0]], !dbg [[DBG3:![0-9]+]]
+; CHECK-NEXT:    [[ADD_I_I:%.*]] = add i64 [[DOTNEG]], 1
+; CHECK-NEXT:    store i64 [[ADD_I_I]], ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %sub5.i.i = sub i64 1, %0, !dbg !4
+  %add.i.i = add i64 %sub5.i.i, 0
+  store i64 %add.i.i, ptr null, align 8
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 20.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocation(line: 212, column: 25, scope: !5)
+!5 = distinct !DISubprogram(name: "foo", scope: !0, file: !1, line: 161, type: !6, scopeLine: 162, unit: !0, retainedNodes: !2)
+!6 = distinct !DISubroutineType(types: !2)
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
+; CHECK: [[DBG3]] = !DILocation(line: 212, column: 25, scope: [[META4:![0-9]+]])
+; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META0]], file: [[META1]], line: 161, type: [[META5:![0-9]+]], scopeLine: 162, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
+; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
+; CHECK: [[META6]] = !{}
+;.
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll
new file mode 100644
index 0000000..9d33a57
--- /dev/null
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll
@@ -0,0 +1,14 @@
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s
+
+define ptr addrspace(1) @test(ptr %a, ptr addrspace(1) %b) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr %a, ptr addrspace(1) %b seq_cst
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @foo, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[RES]]) ]
+; CHECK-NEXT:    [[RES_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0)
+; CHECK-NEXT:    ret ptr addrspace(1) [[RES_RELOCATED]]
+  %res = atomicrmw xchg ptr %a, ptr addrspace(1) %b seq_cst
+  call void @foo()
+  ret ptr addrspace(1) %res
+}
+
+declare void @foo()
diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
index e56dbd7..49aeea9 100644
--- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
@@ -6,11 +6,7 @@ define void @store_load(ptr %ptr) {
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
-; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    store float [[LD0]], ptr [[PTR0]], align 4
-; CHECK-NEXT:    store float [[LD1]], ptr [[PTR1]], align 4
 ; CHECK-NEXT:    store <2 x float> [[VECL]], ptr [[PTR0]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -31,14 +27,8 @@ define void @store_fpext_load(ptr %ptr) {
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT:    [[PTRD0:%.*]] = getelementptr double, ptr [[PTR]], i32 0
 ; CHECK-NEXT:    [[PTRD1:%.*]] = getelementptr double, ptr [[PTR]], i32 1
-; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[FPEXT0:%.*]] = fpext float [[LD0]] to double
-; CHECK-NEXT:    [[FPEXT1:%.*]] = fpext float [[LD1]] to double
 ; CHECK-NEXT:    [[VCAST:%.*]] = fpext <2 x float> [[VECL]] to <2 x double>
-; CHECK-NEXT:    store double [[FPEXT0]], ptr [[PTRD0]], align 8
-; CHECK-NEXT:    store double [[FPEXT1]], ptr [[PTRD1]], align 8
 ; CHECK-NEXT:    store <2 x double> [[VCAST]], ptr [[PTRD0]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -62,20 +52,10 @@ define void @store_fcmp_zext_load(ptr %ptr) {
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
 ; CHECK-NEXT:    [[PTRB0:%.*]] = getelementptr i32, ptr [[PTR]], i32 0
 ; CHECK-NEXT:    [[PTRB1:%.*]] = getelementptr i32, ptr [[PTR]], i32 1
-; CHECK-NEXT:    [[LDB0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDB1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDA0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDA1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[FCMP0:%.*]] = fcmp ogt float [[LDA0]], [[LDB0]]
-; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp ogt float [[LDA1]], [[LDB1]]
 ; CHECK-NEXT:    [[VCMP:%.*]] = fcmp ogt <2 x float> [[VECL]], [[VECL1]]
-; CHECK-NEXT:    [[ZEXT0:%.*]] = zext i1 [[FCMP0]] to i32
-; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i1 [[FCMP1]] to i32
 ; CHECK-NEXT:    [[VCAST:%.*]] = zext <2 x i1> [[VCMP]] to <2 x i32>
-; CHECK-NEXT:    store i32 [[ZEXT0]], ptr [[PTRB0]], align 4
-; CHECK-NEXT:    store i32 [[ZEXT1]], ptr [[PTRB1]], align 4
 ; CHECK-NEXT:    store <2 x i32> [[VCAST]], ptr [[PTRB0]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -101,17 +81,9 @@ define void @store_fadd_load(ptr %ptr) {
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
-; CHECK-NEXT:    [[LDA0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDA1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDB0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LDB1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[FADD0:%.*]] = fadd float [[LDA0]], [[LDB0]]
-; CHECK-NEXT:    [[FADD1:%.*]] = fadd float [[LDA1]], [[LDB1]]
 ; CHECK-NEXT:    [[VEC:%.*]] = fadd <2 x float> [[VECL]], [[VECL1]]
-; CHECK-NEXT:    store float [[FADD0]], ptr [[PTR0]], align 4
-; CHECK-NEXT:    store float [[FADD1]], ptr [[PTR1]], align 4
 ; CHECK-NEXT:    store <2 x float> [[VEC]], ptr [[PTR0]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -133,14 +105,8 @@ define void @store_fneg_load(ptr %ptr) {
 ; CHECK-SAME: ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
-; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[PTR1]], align 4
 ; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
-; CHECK-NEXT:    [[FNEG0:%.*]] = fneg float [[LD0]]
-; CHECK-NEXT:    [[FNEG1:%.*]] = fneg float [[LD1]]
 ; CHECK-NEXT:    [[VEC:%.*]] = fneg <2 x float> [[VECL]]
-; CHECK-NEXT:    store float [[FNEG0]], ptr [[PTR0]], align 4
-; CHECK-NEXT:    store float [[FNEG1]], ptr [[PTR1]], align 4
 ; CHECK-NEXT:    store <2 x float> [[VEC]], ptr [[PTR0]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -155,3 +121,25 @@ define void @store_fneg_load(ptr %ptr) {
   ret void
 }
 
+define float @scalars_with_external_uses_not_dead(ptr %ptr) {
+; CHECK-LABEL: define float @scalars_with_external_uses_not_dead(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1
+; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[PTR0]], align 4
+; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[PTR1]], align 4
+; CHECK-NEXT:    [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4
+; CHECK-NEXT:    store <2 x float> [[VECL]], ptr [[PTR0]], align 4
+; CHECK-NEXT:    [[USER:%.*]] = fneg float [[LD1]]
+; CHECK-NEXT:    ret float [[LD0]]
+;
+  %ptr0 = getelementptr float, ptr %ptr, i32 0
+  %ptr1 = getelementptr float, ptr %ptr, i32 1
+  %ld0 = load float, ptr %ptr0
+  %ld1 = load float, ptr %ptr1
+  store float %ld0, ptr %ptr0
+  store float %ld1, ptr %ptr1
+  %user = fneg float %ld1
+  ret float %ld0
+}
+
diff --git a/llvm/test/tools/llvm-reduce/reduce-flags.ll b/llvm/test/tools/llvm-reduce/reduce-flags.ll
index 293504e3..f220fc4 100644
--- a/llvm/test/tools/llvm-reduce/reduce-flags.ll
+++ b/llvm/test/tools/llvm-reduce/reduce-flags.ll
@@ -272,3 +272,19 @@ define i32 @trunc_nsw_keep(i64 %a) {
   %op = trunc nsw i64 %a to i32
   ret i32 %op
 }
+
+; CHECK-LABEL: @icmp_samesign_drop(
+; INTERESTING: = icmp
+; RESULT: icmp ult i32
+define i1 @icmp_samesign_drop(i32 %a) {
+  %op = icmp samesign ult i32 %a, 10
+  ret i1 %op
+}
+
+; CHECK-LABEL: @icmp_samesign_keep(
+; INTERESTING: = icmp samesign
+; RESULT: icmp samesign ult i32
+define i1 @icmp_samesign_keep(i32 %a) {
+  %op = icmp samesign ult i32 %a, 10
+  ret i1 %op
+}
diff --git a/llvm/test/tools/obj2yaml/ELF/verdef-section.yaml b/llvm/test/tools/obj2yaml/ELF/verdef-section.yaml
index fc3e0f7..f082e897 100644
--- a/llvm/test/tools/obj2yaml/ELF/verdef-section.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/verdef-section.yaml
@@ -48,12 +48,14 @@ Sections:
         Flags:      0
         VersionNdx: 0
         Hash:       0
+        VDAux:      20
         Names:
           - VERSION_0
 ## An entry with arbitrary values.
       - Flags:      2
         VersionNdx: 2
         Hash:       108387921
+        VDAux:      [[VDAUX=20]]
         Names:
           - VERSION_1
 ## Another entry with arbitrary values and version predecessors.
@@ -81,3 +83,9 @@ DynamicSymbols: []
 # RUN:   FileCheck %s -DFILE=%t.version --check-prefix=VERSION-ERR
 
 # VERSION-ERR: Error reading file: [[FILE]]: invalid SHT_GNU_verdef section version: 2
+
+# RUN: yaml2obj %s -DVDAUX=100 -o %t.vdaux
+# RUN: not obj2yaml %t.vdaux 2>&1 | \
+# RUN:   FileCheck %s -DFILE=%t.vdaux --check-prefix=VDAUX-ERR
+
+# VDAUX-ERR: Error reading file: [[FILE]]: corrupted section: vd_aux value 100 in section verdef points past end of the section
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp
index ff9dde40..e157747 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp
@@ -42,6 +42,9 @@ static void reduceFlagsInModule(Oracle &O, ReducerWorkItem &WorkItem) {
       } else if (auto *PDI = dyn_cast<PossiblyDisjointInst>(&I)) {
         if (PDI->isDisjoint() && !O.shouldKeep())
           PDI->setIsDisjoint(false);
+      } else if (auto *ICmp = dyn_cast<ICmpInst>(&I)) {
+        if (ICmp->hasSameSign() && !O.shouldKeep())
+          ICmp->setSameSign(false);
       } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
         GEPNoWrapFlags NW = GEP->getNoWrapFlags();
         if (NW.isInBounds() && !O.shouldKeep())
diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp
index 9b4644b..b1c8032 100644
--- a/llvm/tools/obj2yaml/elf2yaml.cpp
+++ b/llvm/tools/obj2yaml/elf2yaml.cpp
@@ -1451,7 +1451,15 @@ ELFDumper<ELFT>::dumpVerdefSection(const Elf_Shdr *Shdr) {
     if (Verdef->vd_hash != 0)
       Entry.Hash = Verdef->vd_hash;
 
+    if (Verdef->vd_aux != sizeof(Elf_Verdef))
+      Entry.VDAux = Verdef->vd_aux;
+
     const uint8_t *BufAux = Buf + Verdef->vd_aux;
+    if (BufAux > Data.end())
+      return createStringError(
+          errc::invalid_argument,
+          "corrupted section: vd_aux value " + Twine(Verdef->vd_aux) +
+              " in section verdef points past end of the section");
     while (BufAux) {
       const Elf_Verdaux *Verdaux =
           reinterpret_cast<const Elf_Verdaux *>(BufAux);
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 4bdb6ee..d69b2d6 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1297,7 +1297,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_SIMD,         AArch64::AEK_FP16,
       AArch64::AEK_FP16FML,      AArch64::AEK_PROFILE,
       AArch64::AEK_RAS,          AArch64::AEK_SVE,
-      AArch64::AEK_SVE2,         AArch64::AEK_ALIAS_SVE2AES,
+      AArch64::AEK_SVE2,         AArch64::AEK_SVE2AES,
       AArch64::AEK_SVE2SM4,      AArch64::AEK_SVE2SHA3,
       AArch64::AEK_SVE2BITPERM,  AArch64::AEK_RCPC,
       AArch64::AEK_RAND,         AArch64::AEK_MTE,
@@ -1334,7 +1334,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_FPRCVT,       AArch64::AEK_CMPBR,
       AArch64::AEK_LSUI,         AArch64::AEK_OCCMO,
       AArch64::AEK_PCDPHINT,     AArch64::AEK_POPS,
-      AArch64::AEK_SVEAES};
+  };
 
   std::vector<StringRef> Features;
 
@@ -1369,7 +1369,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
   EXPECT_TRUE(llvm::is_contained(Features, "+sve-bfscale"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sve-f16f32mm"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sve2"));
-  EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3"));
@@ -1539,7 +1538,6 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
       {"sve-bfscale", "nosve-bfscale", "+sve-bfscale", "-sve-bfscale"},
       {"sve-f16f32mm", "nosve-f16f32mm", "+sve-f16f32mm", "-sve-f16f32mm"},
       {"sve2", "nosve2", "+sve2", "-sve2"},
-      {"sve-aes", "nosve-aes", "+sve-aes", "-sve-aes"},
       {"sve2-aes", "nosve2-aes", "+sve2-aes", "-sve2-aes"},
       {"sve2-sm4", "nosve2-sm4", "+sve2-sm4", "-sve2-sm4"},
       {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"},
@@ -1842,11 +1840,7 @@ AArch64ExtensionDependenciesBaseArchTestParams
          {},
          {"sve", "sve-f16f32mm"}},
 
-        // aes -> {sve-aes}
-        {AArch64::ARMV8A, {"noaes", "sve-aes"}, {"aes", "sve-aes"}, {}},
-        {AArch64::ARMV8A, {"sve-aes", "noaes"}, {}, {"aes", "sve-aes"}},
-
-        // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4, sve2-aes}
+        // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4}
         {AArch64::ARMV8A, {"nosve2", "sve2p1"}, {"sve2", "sve2p1"}, {}},
         {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}},
         {AArch64::ARMV8A,
@@ -1861,8 +1855,6 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}},
         {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}},
         {AArch64::ARMV8A, {"sve2-sm4", "nosve2"}, {}, {"sve2", "sve2-sm4"}},
-        {AArch64::ARMV8A, {"nosve2", "sve2-aes"}, {"sve2", "sve2-aes"}, {}},
-        {AArch64::ARMV8A, {"sve2-aes", "nosve2"}, {}, {"sve2", "sve2-aes"}},
 
         // sve-b16b16 -> {sme-b16b16}
         {AArch64::ARMV9_4A,
@@ -1963,23 +1955,16 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV8A, {"norcpc", "rcpc3"}, {"rcpc", "rcpc3"}, {}},
         {AArch64::ARMV8A, {"rcpc3", "norcpc"}, {}, {"rcpc", "rcpc3"}},
 
-        // sve-aes -> {ssve-aes, sve2-aes}
+        // sve2-aes -> ssve-aes
         {AArch64::ARMV9_6A,
-         {"nosve-aes", "ssve-aes"},
-         {"sve-aes", "ssve-aes"},
+         {"nosve2-aes", "ssve-aes"},
+         {"sve2-aes", "ssve-aes"},
          {}},
         {AArch64::ARMV9_6A,
-         {"ssve-aes", "nosve-aes"},
+         {"ssve-aes", "nosve2-aes"},
          {},
-         {"ssve-aes", "sve-aes"}},
-        {AArch64::ARMV9_6A,
-         {"nosve-aes", "sve2-aes"},
-         {"sve2-aes", "sve-aes"},
-         {}},
-        {AArch64::ARMV9_6A,
-         {"sve2-aes", "nosve-aes"},
-         {},
-         {"sve2-aes", "sve-aes"}}};
+         {"ssve-aes", "sve2-aes"}},
+};
 
 INSTANTIATE_TEST_SUITE_P(
     AArch64ExtensionDependenciesBaseArch,
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index ee88ca5..b0ea1f7 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3476,6 +3476,243 @@ structured_op: !LinalgStructuredOpConfig
                   scalar_arg: K
 --- !LinalgOpConfig
 metadata: !LinalgOpMetadata
+  name: conv_2d_nhwgc_gfhwc
+  cpp_class_name: Conv2DNhwgcGfhwcOp
+  doc: |-
+    Performs 2-D grouped convolution.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1 * s2 + s3 * s4, s5 * s6 + s7 * s8, s9, s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s9, s11, s3, s7, s10)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s5, s9, s11)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s2, s6)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1 * s2 + d5 * s4, d2 * s6 + d6 * s8, d3, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d3, d4, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: U
+                operands:
+                - !ScalarExpression
+                  scalar_arg: I
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: U
+                operands:
+                - !ScalarExpression
+                  scalar_arg: K
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: conv_2d_nhwgc_gfhwc_q
+  cpp_class_name: Conv2DNhwgcGfhwcQOp
+  doc: |-
+    Performs 2-D grouped convolution with zero point offsets.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1 * s2 + s3 * s4, s5 * s6 + s7 * s8, s9, s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s9, s11, s3, s7, s10)>
+  - !LinalgOperandDefConfig
+    name: IZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: KZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s5, s9, s11)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s2, s6)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1 * s2 + d5 * s4, d2 * s6 + d6 * s8, d3, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d3, d4, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: I
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: IZp
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: K
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: KZp
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
   name: conv_2d_ngchw_gfchw_q
   cpp_class_name: Conv2DNgchwGfchwQOp
   doc: |-
diff --git a/mlir/include/mlir/IR/DialectRegistry.h b/mlir/include/mlir/IR/DialectRegistry.h
index 2c1f696..d3d5348 100644
--- a/mlir/include/mlir/IR/DialectRegistry.h
+++ b/mlir/include/mlir/IR/DialectRegistry.h
@@ -138,7 +138,8 @@ bool hasPromisedInterface(Dialect &dialect) {
 /// encountered.
 class DialectRegistry {
   using MapTy =
-      std::map<std::string, std::pair<TypeID, DialectAllocatorFunction>>;
+      std::map<std::string, std::pair<TypeID, DialectAllocatorFunction>,
+               std::less<>>;
 
 public:
   explicit DialectRegistry();
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index 4e7758b..ce91424 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -524,8 +524,11 @@ LLVMTypeConverter::getMemRefAddressSpace(BaseMemRefType type) const {
     return failure();
   if (!(*converted)) // Conversion to default is 0.
     return 0;
-  if (auto explicitSpace = llvm::dyn_cast_if_present<IntegerAttr>(*converted))
-    return explicitSpace.getInt();
+  if (auto explicitSpace = dyn_cast_if_present<IntegerAttr>(*converted)) {
+    if (explicitSpace.getType().isIndex() ||
+        explicitSpace.getType().isSignlessInteger())
+      return explicitSpace.getInt();
+  }
   return failure();
 }
 
diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
index e48ca51..a640839 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
@@ -74,6 +74,12 @@ std::tuple<Value, Value> AllocationOpLLVMLowering::allocateBufferManuallyAlign(
   MemRefType memRefType = getMemRefResultType(op);
   // Allocate the underlying buffer.
   Type elementPtrType = this->getElementPtrType(memRefType);
+  if (!elementPtrType) {
+    emitError(loc, "conversion of memref memory space ")
+        << memRefType.getMemorySpace()
+        << " to integer address space "
+           "failed. Consider adding memory space conversions.";
+  }
   LLVM::LLVMFuncOp allocFuncOp = getNotalignedAllocFn(
       getTypeConverter(), op->getParentWithTrait<OpTrait::SymbolTable>(),
       getIndexType());
diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
index 8682294..f3413c1 100644
--- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
@@ -42,6 +42,12 @@ void arith::ConstantOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
   }
   if (auto arrayCstAttr =
           llvm::dyn_cast_or_null<DenseIntElementsAttr>(getValue())) {
+    if (arrayCstAttr.isSplat()) {
+      setResultRange(getResult(), ConstantIntRanges::constant(
+                                      arrayCstAttr.getSplatValue<APInt>()));
+      return;
+    }
+
     std::optional<ConstantIntRanges> result;
     for (const APInt &val : arrayCstAttr) {
       auto range = ConstantIntRanges::constant(val);
diff --git a/mlir/lib/IR/Dialect.cpp b/mlir/lib/IR/Dialect.cpp
index cc80677..4756859 100644
--- a/mlir/lib/IR/Dialect.cpp
+++ b/mlir/lib/IR/Dialect.cpp
@@ -217,7 +217,7 @@ DialectRegistry::DialectRegistry() { insert<BuiltinDialect>(); }
 
 DialectAllocatorFunctionRef
 DialectRegistry::getDialectAllocator(StringRef name) const {
-  auto it = registry.find(name.str());
+  auto it = registry.find(name);
   if (it == registry.end())
     return nullptr;
   return it->second.second;
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index d33340f..b9e745f 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -986,8 +986,7 @@ void RegisteredOperationName::insert(
   }
   StringRef name = impl->getName().strref();
   // Insert the operation info if it doesn't exist yet.
-  auto it = ctxImpl.operations.insert({name, nullptr});
-  it.first->second = std::move(ownedImpl);
+  ctxImpl.operations[name] = std::move(ownedImpl);
 
   // Update the registered info for this operation.
   auto emplaced = ctxImpl.registeredOperations.try_emplace(
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 5c1c984..c95cd5e 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -965,6 +965,67 @@ def conv_2d_ngchw_gfchw(
 
 
 @linalg_structured_op
+def conv_2d_nhwgc_gfhwc(
+    I=TensorDef(
+        T1, S.N, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW, S.G, S.C
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.KH, S.KW, S.C),
+    O=TensorDef(U, S.N, S.OH, S.OW, S.G, S.FG, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.oh, D.ow, D.g, D.fg, D.kh, D.kw, D.c)
+    O[D.n, D.oh, D.ow, D.g, D.fg] += TypeFn.cast_signed(
+        U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.g, D.c]
+    ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.kh, D.kw, D.c])
+
+
+@linalg_structured_op
+def conv_2d_nhwgc_gfhwc_q(
+    I=TensorDef(
+        T1, S.N, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW, S.G, S.C
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.KH, S.KW, S.C),
+    IZp=ScalarDef(I32),
+    KZp=ScalarDef(I32),
+    O=TensorDef(U, S.N, S.OH, S.OW, S.G, S.FG, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution with zero point offsets.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.oh, D.ow, D.g, D.fg, D.kh, D.kw, D.c)
+    O[D.n, D.oh, D.ow, D.g, D.fg] += (
+        TypeFn.cast_signed(
+            U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.g, D.c]
+        )
+        - TypeFn.cast_signed(U, IZp)
+    ) * (
+        TypeFn.cast_signed(U, K[D.g, D.fg, D.kh, D.kw, D.c])
+        - TypeFn.cast_signed(U, KZp)
+    )
+
+
+@linalg_structured_op
 def conv_2d_ngchw_gfchw_q(
     I=TensorDef(
         T1, S.N, S.G, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW
diff --git a/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir b/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir
new file mode 100644
index 0000000..7e94677
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir
@@ -0,0 +1,8 @@
+// RUN: mlir-opt %s -finalize-memref-to-llvm -verify-diagnostics
+
+// CHECK-LABEL: @invalid_int_conversion
+func.func @invalid_int_conversion() {
+     // expected-error@+1 {{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}}
+     %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xf32, 1 : ui64> 
+    return
+}
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 65c18de..68aa5a8 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -409,6 +409,38 @@ func.func @conv_2d_ngchw_fgchw(%input: memref<?x?x?x?x?xf32>, %filter: memref<?x
 
 // -----
 
+// CHECK-LABEL: func @conv_2d_nhwgc_gfhwc
+func.func @conv_2d_nhwgc_gfhwc(%input: memref<?x?x?x?x?xf32>, %filter: memref<?x?x?x?x?xf32>, %output: memref<?x?x?x?x?xf32>) {
+  // CHECK:      linalg.conv_2d_nhwgc_gfhwc
+  // CHECK-SAME:   dilations = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   strides = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   ins(%{{.+}}, %{{.+}} : memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>)
+  // CHECK-SAME:   outs(%{{.+}} : memref<?x?x?x?x?xf32>)
+  linalg.conv_2d_nhwgc_gfhwc {dilations = dense<1> : tensor<2xi64>,
+                                         strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>)
+    outs (%output: memref<?x?x?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @conv_2d_nhwgc_gfhwc_tensor
+func.func @conv_2d_nhwgc_gfhwc_tensor(%input: tensor<1x28x28x2x3xf32>, %filter: tensor<2x8x3x3x3xf32>, %output: tensor<1x26x26x2x8xf32>) -> tensor<1x26x26x2x8xf32>  {
+  // CHECK:      linalg.conv_2d_nhwgc_gfhwc
+  // CHECK-SAME:   dilations = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   strides = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   ins(%{{.+}}, %{{.+}} : tensor<1x28x28x2x3xf32>, tensor<2x8x3x3x3xf32>)
+  // CHECK-SAME:   outs(%{{.+}} : tensor<1x26x26x2x8xf32>) -> tensor<1x26x26x2x8xf32>
+  %0 = linalg.conv_2d_nhwgc_gfhwc {dilations = dense<1> : tensor<2xi64>,
+                                         strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: tensor<1x28x28x2x3xf32>, tensor<2x8x3x3x3xf32>)
+    outs (%output: tensor<1x26x26x2x8xf32>) -> tensor<1x26x26x2x8xf32>
+  return  %0 : tensor<1x26x26x2x8xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @conv_2d_ngchw_fgchw_dimensions
 func.func @conv_2d_ngchw_fgchw_dimensions(%input: tensor<1x5x3x32x32xf32>, %filter: tensor<2x5x3x3x3xf32>, %init: tensor<1x5x2x30x30xf32>) -> tensor<1x5x2x30x30xf32> {
   // CHECK:      linalg.conv_2d_ngchw_fgchw
diff --git a/mlir/test/Dialect/Vector/int-range-interface.mlir b/mlir/test/Dialect/Vector/int-range-interface.mlir
index 09dfe93..0263193 100644
--- a/mlir/test/Dialect/Vector/int-range-interface.mlir
+++ b/mlir/test/Dialect/Vector/int-range-interface.mlir
@@ -17,6 +17,13 @@ func.func @constant_splat() -> vector<8xi32> {
   func.return %1 : vector<8xi32>
 }
 
+// CHECK-LABEL: func @float_constant_splat
+// Don't crash on splat floats.
+func.func @float_constant_splat() -> vector<8xf32> {
+  %0 = arith.constant dense<3.0> : vector<8xf32>
+  func.return %0: vector<8xf32>
+}
+
 // CHECK-LABEL: func @vector_splat
 // CHECK: test.reflect_bounds {smax = 5 : index, smin = 4 : index, umax = 5 : index, umin = 4 : index}
 func.func @vector_splat() -> vector<4xindex> {
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 20447d59..7206995 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -32,6 +32,7 @@ exports_files([
     "include/llvm/IR/Intrinsics.td",
     "include/llvm/Option/OptParser.td",
     "utils/lit/lit.py",
+    "utils/lldbDataFormatters.py",
 ])
 
 # It may be tempting to add compiler flags here, but that should be avoided.
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 1e7319e..86817be 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -9969,6 +9969,8 @@ cc_binary(
         ":OpenMPToLLVMIRTranslation",
         ":ToLLVMIRTranslation",
         "//llvm:AsmParser",
+        "//llvm:Core",
+        "//llvm:Linker",
         "//llvm:Support",
         "//llvm:X86AsmParser",
     ],
@@ -10093,34 +10095,6 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "mlir-spirv-cpu-runner",
-    srcs = ["tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp"],
-    deps = [
-        ":ArithDialect",
-        ":BuiltinToLLVMIRTranslation",
-        ":ExecutionEngineUtils",
-        ":FuncDialect",
-        ":FuncToLLVM",
-        ":GPUDialect",
-        ":GPUToSPIRV",
-        ":GPUTransforms",
-        ":LLVMDialect",
-        ":LLVMToLLVMIRTranslation",
-        ":MemRefDialect",
-        ":MlirJitRunner",
-        ":Pass",
-        ":SPIRVConversion",
-        ":SPIRVDialect",
-        ":SPIRVToLLVM",
-        ":SPIRVTransforms",
-        ":ToLLVMIRTranslation",
-        "//llvm:Core",
-        "//llvm:Linker",
-        "//llvm:Support",
-    ],
-)
-
 cc_library(
     name = "TableGen",
     srcs = glob(["lib/TableGen/*.cpp"]),
author	Amir Ayupov <aaupov@fb.com>	2024-11-08 15:25:36 -0800
committer	Amir Ayupov <aaupov@fb.com>	2024-11-08 15:25:36 -0800
commit	2502434bf541db29b813c355f1b679bd07f0def5 (patch)
tree	49c4a44014dd132e062e4ee5836857d70a0690be
parent	c84de4215823f46498a049cca8adcbb6ca6e0f83 (diff)
parent	d936924f5e22e8efbc27873f62e8dfc6e410fcf9 (diff)
download	llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.zip llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.tar.gz llvm-users/shawbyoung/spr/main.bolt-match-blocks-with-pseudo-probes.tar.bz2