author     Xiaofeng Tian <110771974+txff99@users.noreply.github.com>  2024-09-11 14:00:54 -0700
committer  Vitaly Buka <vitalybuka@google.com>  2024-09-11 14:00:54 -0700
commit     8ee4ba92e1a85fd220caf2f83d5d86c334d432db (patch)
tree       847bf2e4f29d84e1369a9d4ef2d1e69c637a36de
parent     01006d0bc93e5c6a6953ea8874eadb56ec719399 (diff)
parent     c2b93e0671d8cfd6b1a24c6e1d7be290125b8974 (diff)
download   llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.zip
           llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.tar.gz
           llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.tar.bz2
[𝘀𝗽𝗿] changes introduced through rebase (branch: users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp | 6
-rw-r--r-- clang/lib/CodeGen/Targets/AArch64.cpp | 6
-rw-r--r-- clang/lib/Sema/HLSLExternalSemaSource.cpp | 10
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp | 19
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h | 5
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp | 3
-rw-r--r-- clang/lib/Tooling/Transformer/Stencil.cpp | 8
-rw-r--r-- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 64
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp | 11
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm | 26
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl | 22
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl | 12
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl | 52
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl | 16
-rw-r--r-- clang/test/CodeGenHLSL/loops/unroll.hlsl | 46
-rw-r--r-- clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 19
-rw-r--r-- clang/unittests/Tooling/StencilTest.cpp | 22
-rw-r--r-- clang/utils/TableGen/ASTTableGen.cpp | 19
-rw-r--r-- clang/utils/TableGen/ASTTableGen.h | 37
-rw-r--r-- clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 2
-rw-r--r-- clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp | 14
-rw-r--r-- clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp | 16
-rw-r--r-- clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp | 13
-rw-r--r-- clang/utils/TableGen/ClangDataCollectorsEmitter.cpp | 2
-rw-r--r-- clang/utils/TableGen/ClangOpcodesEmitter.cpp | 6
-rw-r--r-- clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp | 46
-rw-r--r-- clang/utils/TableGen/ClangOptionDocEmitter.cpp | 46
-rw-r--r-- clang/utils/TableGen/TableGenBackends.h | 27
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64.c | 2
-rw-r--r-- compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4
-rw-r--r-- compiler-rt/test/asan/TestCases/Linux/stress_dtls.c | 5
-rw-r--r-- flang/lib/Semantics/expression.cpp | 1
-rw-r--r-- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 4
-rw-r--r-- libc/config/config.json | 6
-rw-r--r-- libc/docs/configure.rst | 2
-rw-r--r-- libc/hdr/CMakeLists.txt | 18
-rw-r--r-- libc/hdr/link_macros.h | 22
-rw-r--r-- libc/hdr/sys_auxv_macros.h | 22
-rw-r--r-- libc/src/__support/CMakeLists.txt | 3
-rw-r--r-- libc/src/__support/OSUtil/linux/CMakeLists.txt | 30
-rw-r--r-- libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/aarch64/vdso.h | 37
-rw-r--r-- libc/src/__support/OSUtil/linux/arm/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/arm/vdso.h | 37
-rw-r--r-- libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/riscv/vdso.h | 43
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso.cpp | 237
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso.h | 81
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso_sym.h | 70
-rw-r--r-- libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/x86_64/vdso.h | 43
-rw-r--r-- libc/src/__support/macros/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/macros/null_check.h | 33
-rw-r--r-- libc/src/__support/macros/sanitizer.h | 21
-rw-r--r-- libc/src/__support/str_to_float.h | 4
-rw-r--r-- libc/src/stdlib/atexit.cpp | 6
-rw-r--r-- libc/src/stdlib/quick_exit.cpp | 3
-rw-r--r-- libc/src/sys/auxv/getauxval.h | 2
-rw-r--r-- libc/startup/gpu/amdgpu/start.cpp | 3
-rw-r--r-- libc/startup/gpu/nvptx/start.cpp | 3
-rw-r--r-- libc/startup/linux/do_start.cpp | 11
-rw-r--r-- libc/test/src/__support/OSUtil/linux/CMakeLists.txt | 18
-rw-r--r-- libc/test/src/__support/OSUtil/linux/vdso_test.cpp | 162
-rw-r--r-- libc/test/src/compiler/CMakeLists.txt | 1
-rw-r--r-- libc/test/src/compiler/stack_chk_guard_test.cpp | 6
-rw-r--r-- libc/test/src/math/smoke/CMakeLists.txt | 14
-rw-r--r-- libc/test/src/math/smoke/nan_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf128_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf16_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanl_test.cpp | 7
-rw-r--r-- libcxx/include/CMakeLists.txt | 1
-rw-r--r-- libcxx/include/__config | 9
-rw-r--r-- libcxx/include/__pstl/backend.h | 24
-rw-r--r-- libcxx/include/__pstl/backend_fwd.h | 16
-rw-r--r-- libcxx/include/__pstl/backends/default.h | 4
-rw-r--r-- libcxx/include/__pstl/backends/libdispatch.h | 12
-rw-r--r-- libcxx/include/__pstl/backends/serial.h | 4
-rw-r--r-- libcxx/include/__pstl/backends/std_thread.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/any_of.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/cpu_traits.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/fill.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/find_if.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/for_each.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/merge.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/stable_sort.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/transform.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/transform_reduce.h | 4
-rw-r--r-- libcxx/include/__pstl/dispatch.h | 4
-rw-r--r-- libcxx/include/__pstl/handle_exception.h | 4
-rw-r--r-- libcxx/include/experimental/__config | 45
-rw-r--r-- libcxx/include/experimental/__simd/aligned_tag.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/declaration.h | 9
-rw-r--r-- libcxx/include/experimental/__simd/reference.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/scalar.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/simd.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/simd_mask.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/traits.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/utility.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/vec_ext.h | 2
-rw-r--r-- libcxx/include/experimental/iterator | 2
-rw-r--r-- libcxx/include/experimental/memory | 2
-rw-r--r-- libcxx/include/experimental/propagate_const | 2
-rw-r--r-- libcxx/include/experimental/simd | 2
-rw-r--r-- libcxx/include/experimental/type_traits | 2
-rw-r--r-- libcxx/include/experimental/utility | 2
-rw-r--r-- libcxx/include/module.modulemap | 4
-rw-r--r-- libcxx/src/any.cpp | 2
-rw-r--r-- libcxx/src/optional.cpp | 2
-rw-r--r-- lld/test/wasm/static-error.s | 12
-rw-r--r-- lld/wasm/Driver.cpp | 10
-rw-r--r-- lldb/include/lldb/API/SBMemoryRegionInfo.h | 2
-rw-r--r-- lldb/include/lldb/API/SBSaveCoreOptions.h | 11
-rw-r--r-- lldb/include/lldb/Core/SourceManager.h | 7
-rw-r--r-- lldb/include/lldb/Symbol/SaveCoreOptions.h | 11
-rw-r--r-- lldb/include/lldb/Target/CoreFileMemoryRanges.h | 50
-rw-r--r-- lldb/include/lldb/Target/Process.h | 25
-rw-r--r-- lldb/include/lldb/Utility/RangeMap.h | 6
-rw-r--r-- lldb/include/lldb/lldb-enumerations.h | 1
-rw-r--r-- lldb/include/lldb/lldb-forward.h | 1
-rw-r--r-- lldb/include/lldb/lldb-private-interfaces.h | 1
-rw-r--r-- lldb/source/API/SBSaveCoreOptions.cpp | 11
-rw-r--r-- lldb/source/Commands/CommandObjectProcess.cpp | 1
-rw-r--r-- lldb/source/Core/SourceManager.cpp | 24
-rw-r--r-- lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 6
-rw-r--r-- lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp | 35
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h | 5
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h | 1
-rw-r--r-- lldb/source/Symbol/SaveCoreOptions.cpp | 14
-rw-r--r-- lldb/source/Target/CMakeLists.txt | 1
-rw-r--r-- lldb/source/Target/CoreFileMemoryRanges.cpp | 86
-rw-r--r-- lldb/source/Target/Process.cpp | 76
-rw-r--r-- lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py | 149
-rw-r--r-- lldb/test/Shell/SymbolFile/Inputs/main.c | 4
-rw-r--r-- lldb/test/Shell/SymbolFile/checksum-mismatch.test | 7
-rw-r--r-- lldb/unittests/Process/Utility/CMakeLists.txt | 1
-rw-r--r-- lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp | 205
-rw-r--r-- llvm/benchmarks/SandboxIRBench.cpp | 72
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 165
-rw-r--r-- llvm/include/llvm/ADT/DenseSet.h | 55
-rw-r--r-- llvm/include/llvm/Analysis/MemorySSA.h | 28
-rw-r--r-- llvm/include/llvm/IR/Constant.h | 4
-rw-r--r-- llvm/include/llvm/IR/Constants.h | 35
-rw-r--r-- llvm/include/llvm/IR/DerivedUser.h | 4
-rw-r--r-- llvm/include/llvm/IR/Function.h | 7
-rw-r--r-- llvm/include/llvm/IR/GlobalAlias.h | 4
-rw-r--r-- llvm/include/llvm/IR/GlobalIFunc.h | 4
-rw-r--r-- llvm/include/llvm/IR/GlobalObject.h | 7
-rw-r--r-- llvm/include/llvm/IR/GlobalValue.h | 6
-rw-r--r-- llvm/include/llvm/IR/GlobalVariable.h | 31
-rw-r--r-- llvm/include/llvm/IR/InstrTypes.h | 22
-rw-r--r-- llvm/include/llvm/IR/Instruction.h | 2
-rw-r--r-- llvm/include/llvm/IR/Instructions.h | 262
-rw-r--r-- llvm/include/llvm/IR/User.h | 105
-rw-r--r-- llvm/include/llvm/SandboxIR/SandboxIR.h | 15
-rw-r--r-- llvm/include/llvm/Support/Casting.h | 2
-rw-r--r-- llvm/lib/IR/Constants.cpp | 36
-rw-r--r-- llvm/lib/IR/ConstantsContext.h | 44
-rw-r--r-- llvm/lib/IR/Function.cpp | 5
-rw-r--r-- llvm/lib/IR/Globals.cpp | 11
-rw-r--r-- llvm/lib/IR/Instruction.cpp | 4
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 235
-rw-r--r-- llvm/lib/IR/User.cpp | 20
-rw-r--r-- llvm/lib/Passes/PassBuilderPipelines.cpp | 6
-rw-r--r-- llvm/lib/SandboxIR/SandboxIR.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 163
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 57
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 232
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 19
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 17
-rw-r--r-- llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp | 14
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll | 64
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll | 12
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll | 24
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll | 4
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll | 64
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll | 12
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll | 24
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll | 32
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll | 32
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll | 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 243
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 758
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 3808
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 13
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll | 37
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 550
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 357
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 357
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 550
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll | 55
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll | 253
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wave32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.mir | 29
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 648
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 660
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll | 556
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 573
-rw-r--r-- llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir | 104
-rw-r--r-- llvm/test/CodeGen/WebAssembly/half-precision.ll | 29
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll | 696
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll | 93
-rw-r--r-- llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll | 3
-rw-r--r-- llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll | 74
-rw-r--r-- llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll | 1
-rw-r--r-- llvm/tools/llvm-dis/llvm-dis.cpp | 18
-rw-r--r-- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 11
-rw-r--r-- llvm/unittests/SandboxIR/TrackerTest.cpp | 26
-rw-r--r-- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 20
-rw-r--r-- llvm/utils/TableGen/CodeEmitterGen.cpp | 4
-rw-r--r-- llvm/utils/TableGen/CodeGenMapTable.cpp | 4
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp | 49
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenDAGPatterns.h | 61
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenInstruction.cpp | 10
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenInstruction.h | 8
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 41
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenRegisters.h | 28
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 2
-rw-r--r-- llvm/utils/TableGen/Common/DAGISelMatcher.h | 6
-rw-r--r-- llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp | 2
-rw-r--r-- llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h | 5
-rw-r--r-- llvm/utils/TableGen/Common/InfoByHwMode.cpp | 8
-rw-r--r-- llvm/utils/TableGen/Common/InfoByHwMode.h | 7
-rw-r--r-- llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp | 10
-rw-r--r-- llvm/utils/TableGen/Common/SubtargetFeatureInfo.h | 12
-rw-r--r-- llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp | 8
-rw-r--r-- llvm/utils/TableGen/DAGISelEmitter.cpp | 4
-rw-r--r-- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 10
-rw-r--r-- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 8
-rw-r--r-- llvm/utils/TableGen/FastISelEmitter.cpp | 8
-rw-r--r-- llvm/utils/TableGen/GlobalISelEmitter.cpp | 20
-rw-r--r-- llvm/utils/TableGen/InstrDocsEmitter.cpp | 2
-rw-r--r-- llvm/utils/TableGen/InstrInfoEmitter.cpp | 21
-rw-r--r-- llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 6
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn | 1
-rw-r--r-- mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h | 2
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt | 4
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 58
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td | 61
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h | 13
-rw-r--r-- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 7
-rw-r--r-- mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp | 21
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 64
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 135
-rw-r--r-- mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir | 10
-rw-r--r-- mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir | 2
-rw-r--r-- mlir/test/Dialect/Tosa/constant-op-fold.mlir | 4
-rw-r--r-- mlir/test/Dialect/Tosa/invalid.mlir | 74
-rw-r--r-- mlir/test/Dialect/Tosa/ops.mlir | 16
-rw-r--r-- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 11
288 files changed, 10031 insertions, 7117 deletions
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
index 8116db5..98c592d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
@@ -187,14 +187,14 @@ void lambda_value_reference_auxiliary_var(T&& t) {
namespace deleted_functions {
template <typename T>
-void f(T &&) = delete;
+void f(T &&t) = delete;
struct S {
template <typename T>
- S(T &&) = delete;
+ S(T &&t) = delete;
template <typename T>
- void operator&(T &&) = delete;
+ void operator&(T &&t) = delete;
};
} // namespace deleted_functions
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 2f119fe..ec617ee 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -500,7 +500,7 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize,
bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
// For the soft-float ABI variant, no types are considered to be homogeneous
// aggregates.
- if (Kind == AArch64ABIKind::AAPCSSoft)
+ if (isSoftFloat())
return false;
// Homogeneous aggregates for AAPCS64 must have base types of a floating
@@ -555,8 +555,8 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
BaseTy = ArrTy->getElementType();
NumRegs = ArrTy->getNumElements();
}
- bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft &&
- (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
+ bool IsFPR =
+ !isSoftFloat() && (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
// The AArch64 va_list type and handling is specified in the Procedure Call
// Standard, section B.4:
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 071e64f..da7bbf8 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -525,6 +525,16 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
.addArraySubscriptOperators()
.completeDefinition();
});
+
+ Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "StructuredBuffer")
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
+ .Record;
+ onCompletion(Decl, [this](CXXRecordDecl *Decl) {
+ setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
+ ResourceKind::TypedBuffer, /*IsROV=*/false)
+ .addArraySubscriptOperators()
+ .completeDefinition();
+ });
}
void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record,
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 49bbff1..f48b2fd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -143,6 +143,16 @@ bool isReturnValueRefCounted(const clang::FunctionDecl *F) {
return false;
}
+std::optional<bool> isUncounted(const QualType T) {
+ if (auto *Subst = dyn_cast<SubstTemplateTypeParmType>(T)) {
+ if (auto *Decl = Subst->getAssociatedDecl()) {
+ if (isRefType(safeGetName(Decl)))
+ return false;
+ }
+ }
+ return isUncounted(T->getAsCXXRecordDecl());
+}
+
std::optional<bool> isUncounted(const CXXRecordDecl* Class)
{
// Keep isRefCounted first as it's cheaper.
@@ -231,11 +241,9 @@ bool isSingleton(const FunctionDecl *F) {
if (!MethodDecl->isStatic())
return false;
}
- const auto &Name = safeGetName(F);
- std::string SingletonStr = "singleton";
- auto index = Name.find(SingletonStr);
- return index != std::string::npos &&
- index == Name.size() - SingletonStr.size();
+ const auto &NameStr = safeGetName(F);
+ StringRef Name = NameStr; // FIXME: Make safeGetName return StringRef.
+ return Name == "singleton" || Name.ends_with("Singleton");
}
// We only care about statements so let's use the simple
@@ -397,6 +405,7 @@ public:
return true;
if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" ||
+ Name == "WTFReportBacktrace" ||
Name == "WTFCrashWithSecurityImplication" || Name == "WTFCrash" ||
Name == "WTFReportAssertionFailure" || Name == "isMainThread" ||
Name == "isMainThreadOrGCThread" || Name == "isMainRunLoop" ||
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index ec1db1c..2932e62 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -20,6 +20,7 @@ class CXXMethodDecl;
class CXXRecordDecl;
class Decl;
class FunctionDecl;
+class QualType;
class Stmt;
class Type;
@@ -44,6 +45,10 @@ bool isRefCounted(const clang::CXXRecordDecl *Class);
/// \returns true if \p Class is ref-countable AND not ref-counted, false if
/// not, std::nullopt if inconclusive.
+std::optional<bool> isUncounted(const clang::QualType T);
+
+/// \returns true if \p Class is ref-countable AND not ref-counted, false if
+/// not, std::nullopt if inconclusive.
std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
/// \returns true if \p T is either a raw pointer or reference to an uncounted
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 704c082..81c2434 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -87,8 +87,7 @@ public:
}
auto *E = MemberCallExpr->getImplicitObjectArgument();
QualType ArgType = MemberCallExpr->getObjectType();
- std::optional<bool> IsUncounted =
- isUncounted(ArgType->getAsCXXRecordDecl());
+ std::optional<bool> IsUncounted = isUncounted(ArgType);
if (IsUncounted && *IsUncounted && !isPtrOriginSafe(E))
reportBugOnThis(E);
}
diff --git a/clang/lib/Tooling/Transformer/Stencil.cpp b/clang/lib/Tooling/Transformer/Stencil.cpp
index bc4fa6e..223fb5a 100644
--- a/clang/lib/Tooling/Transformer/Stencil.cpp
+++ b/clang/lib/Tooling/Transformer/Stencil.cpp
@@ -50,7 +50,13 @@ static Error printNode(StringRef Id, const MatchFinder::MatchResult &Match,
auto NodeOrErr = getNode(Match.Nodes, Id);
if (auto Err = NodeOrErr.takeError())
return Err;
- NodeOrErr->print(Os, PrintingPolicy(Match.Context->getLangOpts()));
+ const PrintingPolicy PP(Match.Context->getLangOpts());
+ if (const auto *ND = NodeOrErr->get<NamedDecl>()) {
+ // For NamedDecls, we can do a better job than printing the whole thing.
+ ND->getNameForDiagnostic(Os, PP, false);
+ } else {
+ NodeOrErr->print(Os, PP);
+ }
*Result += Output;
return Error::success();
}
diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
new file mode 100644
index 0000000..42991d8
--- /dev/null
+++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s
+
+
+// This test tests two different AST generations. The "EMPTY" test mode verifies
+// the AST generated by forward declaration of the HLSL types which happens on
+// initializing the HLSL external AST with an AST Context.
+
+// The non-empty mode has a use that requires the StructuredBuffer type be complete,
+// which results in the AST being populated by the external AST source. That
+// case covers the full implementation of the template declaration and the
+// instantiated specialization.
+
+// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class StructuredBuffer
+// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+
+// There should be no more occurrences of StructuredBuffer
+// EMPTY-NOT: StructuredBuffer
+
+#ifndef EMPTY
+
+StructuredBuffer<float> Buffer;
+
+#endif
+
+// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class StructuredBuffer definition
+
+// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h 'element_type *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
+
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'const StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &(unsigned int)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class StructuredBuffer definition
+
+// CHECK: TemplateArgument type 'float'
+// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
+// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit referenced h 'float *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index a98c6eb9c..97efb35 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -6,6 +6,7 @@
void WTFBreakpointTrap();
void WTFCrashWithInfo(int, const char*, const char*, int);
void WTFReportAssertionFailure(const char* file, int line, const char* function, const char* assertion);
+void WTFReportBacktrace(void);
void WTFCrash(void);
void WTFCrashWithSecurityImplication(void);
@@ -334,6 +335,7 @@ public:
}
unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); }
unsigned trivial61() { return DerivedNumber('7').value(); }
+ void trivial62() { WTFReportBacktrace(); }
static RefCounted& singleton() {
static RefCounted s_RefCounted;
@@ -341,6 +343,12 @@ public:
return s_RefCounted;
}
+ static RefCounted& otherSingleton() {
+ static RefCounted s_RefCounted;
+ s_RefCounted.ref();
+ return s_RefCounted;
+ }
+
Number nonTrivial1() { return Number(3) + Number(4); }
Number nonTrivial2() { return Number { 0.3 }; }
int nonTrivial3() { return v ? otherFunction() : 0; }
@@ -506,9 +514,12 @@ public:
getFieldTrivial().trivial59(); // no-warning
getFieldTrivial().trivial60(); // no-warning
getFieldTrivial().trivial61(); // no-warning
+ getFieldTrivial().trivial62(); // no-warning
RefCounted::singleton().trivial18(); // no-warning
RefCounted::singleton().someFunction(); // no-warning
+ RefCounted::otherSingleton().trivial18(); // no-warning
+ RefCounted::otherSingleton().someFunction(); // no-warning
getFieldTrivial().recursiveTrivialFunction(7); // no-warning
getFieldTrivial().recursiveComplexFunction(9);
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
new file mode 100644
index 0000000..db0c5b1
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
@@ -0,0 +1,26 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#import "mock-types.h"
+#import "mock-system-header.h"
+#import "../../Inputs/system-header-simulator-for-objc-dealloc.h"
+
+@interface Foo : NSObject
+
+@property (nonatomic, readonly) RefPtr<RefCountable> countable;
+
+- (void)execute;
+- (RefPtr<RefCountable>)_protectedRefCountable;
+@end
+
+@implementation Foo
+
+- (void)execute {
+ self._protectedRefCountable->method();
+}
+
+- (RefPtr<RefCountable>)_protectedRefCountable {
+ return _countable;
+}
+
+@end
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
new file mode 100644
index 0000000..16b7295
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+StructuredBuffer<float> Buffer1;
+StructuredBuffer<vector<float, 4> > BufferArray[4];
+
+StructuredBuffer<float> Buffer2 : register(u3);
+StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
+
+StructuredBuffer<float> Buffer3 : register(u3, space1);
+StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
+
+[numthreads(1,1,1)]
+void main() {
+}
+
+// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
+// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0}
+// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0}
+// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1}
+// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1}
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
new file mode 100644
index 0000000..34019e5
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
+
+StructuredBuffer<float> Buf;
+
+// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
+// CHECK-NEXT: entry:
+
+// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
+// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
new file mode 100644
index 0000000..8ddf8a6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+ BufI16[GI] = 0;
+ BufU16[GI] = 0;
+ BufI32[GI] = 0;
+ BufU32[GI] = 0;
+ BufI64[GI] = 0;
+ BufU64[GI] = 0;
+ BufF16[GI] = 0;
+ BufF32[GI] = 0;
+ BufF64[GI] = 0;
+ BufI16x4[GI] = 0;
+ BufU32x3[GI] = 0;
+ BufF16x2[GI] = 0;
+ BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
new file mode 100644
index 0000000..9bd885d
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+
+StructuredBuffer<int> In;
+StructuredBuffer<int> Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+ Out[GI] = In[GI];
+}
+
+// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
+// and confusing to follow so the match here is pretty weak.
+
+// CHECK: define internal void @"?main@@YAXI@Z"
+// CHECK-NOT: call
+// CHECK: ret void
diff --git a/clang/test/CodeGenHLSL/loops/unroll.hlsl b/clang/test/CodeGenHLSL/loops/unroll.hlsl
index 7389f21..efca074 100644
--- a/clang/test/CodeGenHLSL/loops/unroll.hlsl
+++ b/clang/test/CodeGenHLSL/loops/unroll.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
// RUN: dxil-pc-shadermodel6.3-library -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
/*** for ***/
@@ -35,8 +35,8 @@ void for_nested_one_unroll_enable()
for( int j = 0; j < 10; ++j)
s += i + j;
}
-// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE:.*]]
-// CHECK-NOT: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_1_ENABLE:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_INNER:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_OUTER:.*]]
}
void for_nested_two_unroll_enable()
@@ -111,20 +111,26 @@ void do_enable()
}
-// CHECK: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], ![[FOR_COUNT:.*]]}
-// CHECK: ![[FOR_COUNT]] = !{!"llvm.loop.unroll.count", i32 8}
-// CHECK: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], ![[DISABLE:.*]]}
-// CHECK: ![[DISABLE]] = !{!"llvm.loop.unroll.disable"}
-// CHECK: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], ![[ENABLE:.*]]}
-// CHECK: ![[ENABLE]] = !{!"llvm.loop.unroll.enable"}
-// CHECK: ![[FOR_NESTED_ENABLE]] = distinct !{![[FOR_NESTED_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], ![[WHILE_COUNT:.*]]}
-// CHECK: ![[WHILE_COUNT]] = !{!"llvm.loop.unroll.count", i32 4}
-// CHECK: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], ![[DO_COUNT:.*]]}
-// CHECK: ![[DO_COUNT]] = !{!"llvm.loop.unroll.count", i32 16}
-// CHECK: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], ![[ENABLE]]}
+// CHECK-DAG: [[MUST_PROGRESS:.*]] = !{!"llvm.loop.mustprogress"}
+// CHECK-DAG: [[DISABLE:.*]] = !{!"llvm.loop.unroll.disable"}
+// CHECK-DAG: [[FOR_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 8}
+// CHECK-DAG: [[ENABLE:.*]] = !{!"llvm.loop.unroll.enable"}
+// CHECK-DAG: [[WHILE_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 4}
+// CHECK-DAG: [[DO_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 16}
+
+// CHECK-DAG: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], [[MUST_PROGRESS]], [[FOR_COUNT]]}
+// CHECK-DAG: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_INNER]] = distinct !{![[FOR_NESTED_ENABLE_INNER]], [[MUST_PROGRESS]]}
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_OUTER]] = distinct !{![[FOR_NESTED_ENABLE_OUTER]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], [[MUST_PROGRESS]], [[WHILE_COUNT]]}
+
+// CHECK-DAG: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], [[MUST_PROGRESS]], [[DO_COUNT]]}
+
+// CHECK-DAG: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
new file mode 100644
index 0000000..2450941
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s
+
+typedef vector<float, 3> float3;
+
+StructuredBuffer<float3> Buffer;
+
+// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer BufferErr1;
+
+// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer<> BufferErr2;
+
+[numthreads(1,1,1)]
+void main() {
+ (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3> >'}}
+ // expected-note@* {{implicitly declared private here}}
+}
diff --git a/clang/unittests/Tooling/StencilTest.cpp b/clang/unittests/Tooling/StencilTest.cpp
index 26257cf..445912a 100644
--- a/clang/unittests/Tooling/StencilTest.cpp
+++ b/clang/unittests/Tooling/StencilTest.cpp
@@ -565,6 +565,28 @@ TEST_F(StencilTest, DescribeAnonNamespaceType) {
HasValue(std::string(Expected)));
}
+TEST_F(StencilTest, DescribeFunction) {
+ std::string Snippet = "int F(); F();";
+ std::string Expected = "F";
+ auto StmtMatch = matchStmt(Snippet, callExpr(callee(namedDecl().bind("fn"))));
+ ASSERT_TRUE(StmtMatch);
+ EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+ HasValue(std::string(Expected)));
+}
+
+TEST_F(StencilTest, DescribeImplicitOperator) {
+ std::string Snippet = "struct Tag {}; [](Tag){};";
+ std::string Expected = "operator()";
+ auto StmtMatch = matchStmt(
+ Snippet,
+ stmt(hasDescendant(
+ cxxMethodDecl(hasParameter(0, hasType(namedDecl(hasName("Tag")))))
+ .bind("fn"))));
+ ASSERT_TRUE(StmtMatch);
+ EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+ HasValue(std::string(Expected)));
+}
+
TEST_F(StencilTest, RunOp) {
StringRef Id = "id";
auto SimpleFn = [Id](const MatchResult &R) {
diff --git a/clang/utils/TableGen/ASTTableGen.cpp b/clang/utils/TableGen/ASTTableGen.cpp
index 54288ff..4734477 100644
--- a/clang/utils/TableGen/ASTTableGen.cpp
+++ b/clang/utils/TableGen/ASTTableGen.cpp
@@ -31,7 +31,8 @@ llvm::StringRef clang::tblgen::HasProperties::getName() const {
}
}
-static StringRef removeExpectedNodeNameSuffix(Record *node, StringRef suffix) {
+static StringRef removeExpectedNodeNameSuffix(const Record *node,
+ StringRef suffix) {
StringRef nodeName = node->getName();
if (!nodeName.ends_with(suffix)) {
PrintFatalError(node->getLoc(),
@@ -105,8 +106,7 @@ static void visitASTNodeRecursive(ASTNode node, ASTNode base,
}
}
-static void visitHierarchy(RecordKeeper &records,
- StringRef nodeClassName,
+static void visitHierarchy(const RecordKeeper &records, StringRef nodeClassName,
ASTNodeHierarchyVisitor<ASTNode> visit) {
// Check for the node class, just as a basic correctness check.
if (!records.getClass(nodeClassName)) {
@@ -114,13 +114,10 @@ static void visitHierarchy(RecordKeeper &records,
+ nodeClassName);
}
- // Find all the nodes in the hierarchy.
- auto nodes = records.getAllDerivedDefinitions(nodeClassName);
-
- // Derive the child map.
+ // Derive the child map for all nodes in the hierarchy.
ChildMap hierarchy;
ASTNode root;
- for (ASTNode node : nodes) {
+ for (ASTNode node : records.getAllDerivedDefinitions(nodeClassName)) {
if (auto base = node.getBase())
hierarchy.insert(std::make_pair(base, node));
else if (root)
@@ -136,8 +133,8 @@ static void visitHierarchy(RecordKeeper &records,
visitASTNodeRecursive(root, ASTNode(), hierarchy, visit);
}
-void clang::tblgen::visitASTNodeHierarchyImpl(RecordKeeper &records,
- StringRef nodeClassName,
- ASTNodeHierarchyVisitor<ASTNode> visit) {
+void clang::tblgen::visitASTNodeHierarchyImpl(
+ const RecordKeeper &records, StringRef nodeClassName,
+ ASTNodeHierarchyVisitor<ASTNode> visit) {
visitHierarchy(records, nodeClassName, visit);
}
diff --git a/clang/utils/TableGen/ASTTableGen.h b/clang/utils/TableGen/ASTTableGen.h
index 41f78a6..143d779 100644
--- a/clang/utils/TableGen/ASTTableGen.h
+++ b/clang/utils/TableGen/ASTTableGen.h
@@ -87,18 +87,18 @@ namespace clang {
namespace tblgen {
class WrappedRecord {
- llvm::Record *Record;
+ const llvm::Record *Record;
protected:
- WrappedRecord(llvm::Record *record = nullptr) : Record(record) {}
+ WrappedRecord(const llvm::Record *record = nullptr) : Record(record) {}
- llvm::Record *get() const {
+ const llvm::Record *get() const {
assert(Record && "accessing null record");
return Record;
}
public:
- llvm::Record *getRecord() const { return Record; }
+ const llvm::Record *getRecord() const { return Record; }
explicit operator bool() const { return Record != nullptr; }
@@ -144,7 +144,7 @@ class HasProperties : public WrappedRecord {
public:
static constexpr llvm::StringRef ClassName = HasPropertiesClassName;
- HasProperties(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ HasProperties(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
llvm::StringRef getName() const;
@@ -157,7 +157,7 @@ public:
/// in one of Clang's AST hierarchies.
class ASTNode : public HasProperties {
public:
- ASTNode(llvm::Record *record = nullptr) : HasProperties(record) {}
+ ASTNode(const llvm::Record *record = nullptr) : HasProperties(record) {}
llvm::StringRef getName() const {
return get()->getName();
@@ -180,7 +180,7 @@ public:
class DeclNode : public ASTNode {
public:
- DeclNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ DeclNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
llvm::StringRef getId() const;
std::string getClassName() const;
@@ -202,7 +202,7 @@ public:
class TypeNode : public ASTNode {
public:
- TypeNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ TypeNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
llvm::StringRef getId() const;
llvm::StringRef getClassName() const;
@@ -224,7 +224,7 @@ public:
class StmtNode : public ASTNode {
public:
- StmtNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ StmtNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
std::string getId() const;
llvm::StringRef getClassName() const;
@@ -247,7 +247,7 @@ public:
/// The type of a property.
class PropertyType : public WrappedRecord {
public:
- PropertyType(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ PropertyType(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Is this a generic specialization (i.e. `Array<T>` or `Optional<T>`)?
bool isGenericSpecialization() const {
@@ -331,7 +331,7 @@ public:
/// A rule for returning the kind of a type.
class TypeKindRule : public WrappedRecord {
public:
- TypeKindRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ TypeKindRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the type to which this applies.
PropertyType getParentType() const {
@@ -361,7 +361,7 @@ public:
/// An implementation case of a property type.
class TypeCase : public HasProperties {
public:
- TypeCase(llvm::Record *record = nullptr) : HasProperties(record) {}
+ TypeCase(const llvm::Record *record = nullptr) : HasProperties(record) {}
/// Return the name of this case.
llvm::StringRef getCaseName() const {
@@ -381,7 +381,7 @@ public:
/// A property of an AST node.
class Property : public WrappedRecord {
public:
- Property(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ Property(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the name of this property.
llvm::StringRef getName() const {
@@ -417,7 +417,8 @@ public:
/// a value (which is actually done when writing the value out).
class ReadHelperRule : public WrappedRecord {
public:
- ReadHelperRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ ReadHelperRule(const llvm::Record *record = nullptr)
+ : WrappedRecord(record) {}
/// Return the class for which this is a creation rule.
/// Should never be abstract.
@@ -437,7 +438,7 @@ public:
/// A rule for how to create an AST node from its properties.
class CreationRule : public WrappedRecord {
public:
- CreationRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ CreationRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the class for which this is a creation rule.
/// Should never be abstract.
@@ -457,7 +458,7 @@ public:
/// A rule which overrides the standard rules for serializing an AST node.
class OverrideRule : public WrappedRecord {
public:
- OverrideRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ OverrideRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the class for which this is an override rule.
/// Should never be abstract.
@@ -483,12 +484,12 @@ template <class NodeClass>
using ASTNodeHierarchyVisitor =
llvm::function_ref<void(NodeClass node, NodeClass base)>;
-void visitASTNodeHierarchyImpl(llvm::RecordKeeper &records,
+void visitASTNodeHierarchyImpl(const llvm::RecordKeeper &records,
llvm::StringRef nodeClassName,
ASTNodeHierarchyVisitor<ASTNode> visit);
template <class NodeClass>
-void visitASTNodeHierarchy(llvm::RecordKeeper &records,
+void visitASTNodeHierarchy(const llvm::RecordKeeper &records,
ASTNodeHierarchyVisitor<NodeClass> visit) {
visitASTNodeHierarchyImpl(records, NodeClass::getTableGenNodeClassName(),
[visit](ASTNode node, ASTNode base) {
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 94f12a0..4ae7600 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -345,7 +345,7 @@ void EmitBuiltin(llvm::raw_ostream &OS, const Record *Builtin) {
}
} // namespace
-void clang::EmitClangBuiltins(llvm::RecordKeeper &Records,
+void clang::EmitClangBuiltins(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS) {
emitSourceFileHeader("List of builtins that Clang recognizes", OS);
diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
index aee7d38..1a2503d 100644
--- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
@@ -20,16 +20,16 @@
using namespace llvm;
-void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
+void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records,
raw_ostream &OS) {
emitSourceFileHeader("A list of commands useable in documentation comments",
OS, Records);
OS << "namespace {\n"
"const CommandInfo Commands[] = {\n";
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
OS << " { "
<< "\"" << Tag.getValueAsString("Name") << "\", "
<< "\"" << Tag.getValueAsString("EndCommandName") << "\", " << i << ", "
@@ -62,7 +62,7 @@ void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
std::vector<StringMatcher::StringPair> Matches;
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
std::string Name = std::string(Tag.getValueAsString("Name"));
std::string Return;
raw_string_ostream(Return) << "return &Commands[" << i << "];";
@@ -112,7 +112,7 @@ static std::string MangleName(StringRef Str) {
return Mangled;
}
-void clang::EmitClangCommentCommandList(RecordKeeper &Records,
+void clang::EmitClangCommentCommandList(const RecordKeeper &Records,
raw_ostream &OS) {
emitSourceFileHeader("A list of commands useable in documentation comments",
OS, Records);
@@ -121,9 +121,9 @@ void clang::EmitClangCommentCommandList(RecordKeeper &Records,
<< "# define COMMENT_COMMAND(NAME)\n"
<< "#endif\n";
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
std::string MangledName = MangleName(Tag.getValueAsString("Name"));
OS << "COMMENT_COMMAND(" << MangledName << ")\n";
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
index f1cd9af..bd75b3f 100644
--- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
@@ -46,21 +46,17 @@ static bool translateCodePointToUTF8(unsigned CodePoint,
return true;
}
-void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
- raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
+void clang::EmitClangCommentHTMLNamedCharacterReferences(
+ const RecordKeeper &Records, raw_ostream &OS) {
std::vector<StringMatcher::StringPair> NameToUTF8;
SmallString<32> CLiteral;
- for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
- I != E; ++I) {
- Record &Tag = **I;
- std::string Spelling = std::string(Tag.getValueAsString("Spelling"));
- uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
+ for (const Record *Tag : Records.getAllDerivedDefinitions("NCR")) {
+ std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
+ uint64_t CodePoint = Tag->getValueAsInt("CodePoint");
CLiteral.clear();
CLiteral.append("return ");
if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
- SrcMgr.PrintMessage(Tag.getLoc().front(),
- SourceMgr::DK_Error,
+ SrcMgr.PrintMessage(Tag->getLoc().front(), SourceMgr::DK_Error,
Twine("invalid code point"));
continue;
}
diff --git a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
index 3dc1098..a457315 100644
--- a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
@@ -19,10 +19,11 @@
using namespace llvm;
-void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+void clang::EmitClangCommentHTMLTags(const RecordKeeper &Records,
+ raw_ostream &OS) {
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
std::vector<StringMatcher::StringPair> Matches;
- for (Record *Tag : Tags) {
+ for (const Record *Tag : Tags) {
Matches.emplace_back(std::string(Tag->getValueAsString("Spelling")),
"return true;");
}
@@ -35,12 +36,12 @@ void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
<< "}\n\n";
}
-void clang::EmitClangCommentHTMLTagsProperties(RecordKeeper &Records,
+void clang::EmitClangCommentHTMLTagsProperties(const RecordKeeper &Records,
raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
std::vector<StringMatcher::StringPair> MatchesEndTagOptional;
std::vector<StringMatcher::StringPair> MatchesEndTagForbidden;
- for (Record *Tag : Tags) {
+ for (const Record *Tag : Tags) {
std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
StringMatcher::StringPair Match(Spelling, "return true;");
if (Tag->getValueAsBit("EndTagOptional"))
diff --git a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
index 4508293..dae6710 100644
--- a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
@@ -4,7 +4,7 @@
using namespace llvm;
-void clang::EmitClangDataCollectors(RecordKeeper &RK, raw_ostream &OS) {
+void clang::EmitClangDataCollectors(const RecordKeeper &RK, raw_ostream &OS) {
const auto &Defs = RK.getClasses();
for (const auto &Entry : Defs) {
Record &R = *Entry.second;
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 120e1e2..7e426d5 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -20,11 +20,11 @@ using namespace llvm;
namespace {
class ClangOpcodesEmitter {
- RecordKeeper &Records;
+ const RecordKeeper &Records;
unsigned NumTypes;
public:
- ClangOpcodesEmitter(RecordKeeper &R)
+ ClangOpcodesEmitter(const RecordKeeper &R)
: Records(R), NumTypes(Records.getAllDerivedDefinitions("Type").size()) {}
void run(raw_ostream &OS);
@@ -404,6 +404,6 @@ void ClangOpcodesEmitter::PrintTypes(raw_ostream &OS,
OS << ">";
}
-void clang::EmitClangOpcodes(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpcodes(const RecordKeeper &Records, raw_ostream &OS) {
ClangOpcodesEmitter(Records).run(OS);
}
diff --git a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
index 74c3a85..d68dcc4 100644
--- a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
@@ -87,7 +87,7 @@ struct BuiltinTableEntries {
//
class BuiltinNameEmitter {
public:
- BuiltinNameEmitter(RecordKeeper &Records, raw_ostream &OS)
+ BuiltinNameEmitter(const RecordKeeper &Records, raw_ostream &OS)
: Records(Records), OS(OS) {}
// Entrypoint to generate the functions and structures for checking
@@ -100,7 +100,7 @@ private:
// Contains OpenCL builtin functions and related information, stored as
// Record instances. They are coming from the associated TableGen file.
- RecordKeeper &Records;
+ const RecordKeeper &Records;
// The output file.
raw_ostream &OS;
@@ -113,7 +113,7 @@ private:
// \param Output (out) String containing the enums to emit in the output file.
// \param List (out) List containing the extracted Types, except the Types in
// TypesSeen.
- void ExtractEnumTypes(std::vector<Record *> &Types,
+ void ExtractEnumTypes(ArrayRef<const Record *> Types,
StringMap<bool> &TypesSeen, std::string &Output,
std::vector<const Record *> &List);
@@ -237,7 +237,7 @@ private:
/// Base class for emitting a file (e.g. header or test) from OpenCLBuiltins.td
class OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinFileEmitterBase(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinFileEmitterBase(const RecordKeeper &Records, raw_ostream &OS)
: Records(Records), OS(OS) {}
virtual ~OpenCLBuiltinFileEmitterBase() = default;
@@ -305,7 +305,7 @@ protected:
// Contains OpenCL builtin functions and related information, stored as
// Record instances. They are coming from the associated TableGen file.
- RecordKeeper &Records;
+ const RecordKeeper &Records;
// The output file.
raw_ostream &OS;
@@ -316,7 +316,7 @@ protected:
// builtin function described in the .td input.
class OpenCLBuiltinTestEmitter : public OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinTestEmitter(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinTestEmitter(const RecordKeeper &Records, raw_ostream &OS)
: OpenCLBuiltinFileEmitterBase(Records, OS) {}
// Entrypoint to generate the functions for testing all OpenCL builtin
@@ -329,7 +329,7 @@ public:
// prototype for each builtin function described in the .td input.
class OpenCLBuiltinHeaderEmitter : public OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinHeaderEmitter(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinHeaderEmitter(const RecordKeeper &Records, raw_ostream &OS)
: OpenCLBuiltinFileEmitterBase(Records, OS) {}
// Entrypoint to generate the header.
@@ -362,7 +362,7 @@ void BuiltinNameEmitter::Emit() {
EmitQualTypeFinder();
}
-void BuiltinNameEmitter::ExtractEnumTypes(std::vector<Record *> &Types,
+void BuiltinNameEmitter::ExtractEnumTypes(ArrayRef<const Record *> Types,
StringMap<bool> &TypesSeen,
std::string &Output,
std::vector<const Record *> &List) {
@@ -392,11 +392,11 @@ void BuiltinNameEmitter::EmitDeclarations() {
// Extract generic types and non-generic types separately, to keep
// gentypes at the end of the enum which simplifies the special handling
// for gentypes in SemaLookup.
- std::vector<Record *> GenTypes =
+ ArrayRef<const Record *> GenTypes =
Records.getAllDerivedDefinitions("GenericType");
ExtractEnumTypes(GenTypes, TypesSeen, GenTypeEnums, GenTypeList);
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
ExtractEnumTypes(Types, TypesSeen, TypeEnums, TypeList);
OS << TypeEnums;
@@ -499,7 +499,7 @@ static void VerifySignature(const std::vector<Record *> &Signature,
void BuiltinNameEmitter::GetOverloads() {
// Populate the TypeMap.
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
unsigned I = 0;
for (const auto &T : Types) {
TypeMap.insert(std::make_pair(T, I++));
@@ -507,7 +507,8 @@ void BuiltinNameEmitter::GetOverloads() {
// Populate the SignaturesList and the FctOverloadMap.
unsigned CumulativeSignIndex = 0;
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ ArrayRef<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
for (const auto *B : Builtins) {
StringRef BName = B->getValueAsString("Name");
FctOverloadMap.try_emplace(BName);
@@ -535,7 +536,7 @@ void BuiltinNameEmitter::GetOverloads() {
void BuiltinNameEmitter::EmitExtensionTable() {
OS << "static const char *FunctionExtensionTable[] = {\n";
unsigned Index = 0;
- std::vector<Record *> FuncExtensions =
+ ArrayRef<const Record *> FuncExtensions =
Records.getAllDerivedDefinitions("FunctionExtension");
for (const auto &FE : FuncExtensions) {
@@ -804,11 +805,11 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
OS << "\n switch (Ty.ID) {\n";
// Switch cases for image types (Image2d, Image3d, ...)
- std::vector<Record *> ImageTypes =
+ ArrayRef<const Record *> ImageTypes =
Records.getAllDerivedDefinitions("ImageType");
// Map an image type name to its 3 access-qualified types (RO, WO, RW).
- StringMap<SmallVector<Record *, 3>> ImageTypesMap;
+ StringMap<SmallVector<const Record *, 3>> ImageTypesMap;
for (auto *IT : ImageTypes)
ImageTypesMap[IT->getValueAsString("Name")].push_back(IT);
@@ -890,7 +891,7 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
// Switch cases for non generic, non image types (int, int4, float, ...).
// Only insert the plain scalar type; vector information and type qualifiers
// are added in step 2.
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
StringMap<bool> TypesSeen;
for (const auto *T : Types) {
@@ -1211,7 +1212,8 @@ void OpenCLBuiltinTestEmitter::emit() {
unsigned TestID = 0;
// Iterate over all builtins.
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ ArrayRef<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
for (const auto *B : Builtins) {
StringRef Name = B->getValueAsString("Name");
@@ -1274,7 +1276,8 @@ void OpenCLBuiltinHeaderEmitter::emit() {
)";
// Iterate over all builtins; sort to follow order of definition in .td file.
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ std::vector<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
llvm::sort(Builtins, LessRecord());
for (const auto *B : Builtins) {
@@ -1319,18 +1322,19 @@ void OpenCLBuiltinHeaderEmitter::emit() {
"#pragma OPENCL EXTENSION all : disable\n";
}
-void clang::EmitClangOpenCLBuiltins(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpenCLBuiltins(const RecordKeeper &Records,
+ raw_ostream &OS) {
BuiltinNameEmitter NameChecker(Records, OS);
NameChecker.Emit();
}
-void clang::EmitClangOpenCLBuiltinHeader(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinHeader(const RecordKeeper &Records,
raw_ostream &OS) {
OpenCLBuiltinHeaderEmitter HeaderFileGenerator(Records, OS);
HeaderFileGenerator.emit();
}
-void clang::EmitClangOpenCLBuiltinTests(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinTests(const RecordKeeper &Records,
raw_ostream &OS) {
OpenCLBuiltinTestEmitter TestFileGenerator(Records, OS);
TestFileGenerator.emit();
diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
index 8683561..8c32f02 100644
--- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp
+++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
namespace {
struct DocumentedOption {
- Record *Option;
- std::vector<Record*> Aliases;
+ const Record *Option;
+ std::vector<const Record *> Aliases;
};
struct DocumentedGroup;
struct Documentation {
@@ -37,7 +37,7 @@ struct Documentation {
}
};
struct DocumentedGroup : Documentation {
- Record *Group;
+ const Record *Group;
};
static bool hasFlag(const Record *Option, StringRef OptionFlag,
@@ -63,25 +63,25 @@ static bool isOptionVisible(const Record *Option, const Record *DocInfo) {
}
// Reorganize the records into a suitable form for emitting documentation.
-Documentation extractDocumentation(RecordKeeper &Records,
+Documentation extractDocumentation(const RecordKeeper &Records,
const Record *DocInfo) {
Documentation Result;
// Build the tree of groups. The root in the tree is the fake option group
// (Record*)nullptr, which contains all top-level groups and options.
- std::map<Record*, std::vector<Record*> > OptionsInGroup;
- std::map<Record*, std::vector<Record*> > GroupsInGroup;
- std::map<Record*, std::vector<Record*> > Aliases;
+ std::map<const Record *, std::vector<const Record *>> OptionsInGroup;
+ std::map<const Record *, std::vector<const Record *>> GroupsInGroup;
+ std::map<const Record *, std::vector<const Record *>> Aliases;
- std::map<std::string, Record*> OptionsByName;
- for (Record *R : Records.getAllDerivedDefinitions("Option"))
+ std::map<std::string, const Record *> OptionsByName;
+ for (const Record *R : Records.getAllDerivedDefinitions("Option"))
OptionsByName[std::string(R->getValueAsString("Name"))] = R;
- auto Flatten = [](Record *R) {
+ auto Flatten = [](const Record *R) {
return R->getValue("DocFlatten") && R->getValueAsBit("DocFlatten");
};
- auto SkipFlattened = [&](Record *R) -> Record* {
+ auto SkipFlattened = [&](const Record *R) -> const Record * {
while (R && Flatten(R)) {
auto *G = dyn_cast<DefInit>(R->getValueInit("Group"));
if (!G)
@@ -91,17 +91,17 @@ Documentation extractDocumentation(RecordKeeper &Records,
return R;
};
- for (Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
if (Flatten(R))
continue;
- Record *Group = nullptr;
+ const Record *Group = nullptr;
if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
Group = SkipFlattened(G->getDef());
GroupsInGroup[Group].push_back(R);
}
- for (Record *R : Records.getAllDerivedDefinitions("Option")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("Option")) {
if (auto *A = dyn_cast<DefInit>(R->getValueInit("Alias"))) {
Aliases[A->getDef()].push_back(R);
continue;
@@ -120,33 +120,33 @@ Documentation extractDocumentation(RecordKeeper &Records,
}
}
- Record *Group = nullptr;
+ const Record *Group = nullptr;
if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
Group = SkipFlattened(G->getDef());
OptionsInGroup[Group].push_back(R);
}
- auto CompareByName = [](Record *A, Record *B) {
+ auto CompareByName = [](const Record *A, const Record *B) {
return A->getValueAsString("Name") < B->getValueAsString("Name");
};
- auto CompareByLocation = [](Record *A, Record *B) {
+ auto CompareByLocation = [](const Record *A, const Record *B) {
return A->getLoc()[0].getPointer() < B->getLoc()[0].getPointer();
};
- auto DocumentationForOption = [&](Record *R) -> DocumentedOption {
+ auto DocumentationForOption = [&](const Record *R) -> DocumentedOption {
auto &A = Aliases[R];
llvm::sort(A, CompareByName);
return {R, std::move(A)};
};
- std::function<Documentation(Record *)> DocumentationForGroup =
- [&](Record *R) -> Documentation {
+ std::function<Documentation(const Record *)> DocumentationForGroup =
+ [&](const Record *R) -> Documentation {
Documentation D;
auto &Groups = GroupsInGroup[R];
llvm::sort(Groups, CompareByLocation);
- for (Record *G : Groups) {
+ for (const Record *G : Groups) {
D.Groups.emplace_back();
D.Groups.back().Group = G;
Documentation &Base = D.Groups.back();
@@ -157,7 +157,7 @@ Documentation extractDocumentation(RecordKeeper &Records,
auto &Options = OptionsInGroup[R];
llvm::sort(Options, CompareByName);
- for (Record *O : Options)
+ for (const Record *O : Options)
if (isOptionVisible(O, DocInfo))
D.Options.push_back(DocumentationForOption(O));
@@ -444,7 +444,7 @@ void emitDocumentation(int Depth, const Documentation &Doc,
} // namespace
-void clang::EmitClangOptDocs(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOptDocs(const RecordKeeper &Records, raw_ostream &OS) {
const Record *DocInfo = Records.getDef("GlobalDocumentation");
if (!DocInfo) {
PrintFatalError("The GlobalDocumentation top-level definition is missing, "
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 3a424c9..fe55ef2 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -73,7 +73,8 @@ void EmitClangAttrNodeTraverse(llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
void EmitClangAttrDocTable(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangBuiltins(const llvm::RecordKeeper &Records,
+ llvm::raw_ostream &OS);
void EmitClangDiagsDefs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS,
const std::string &Component);
@@ -83,18 +84,18 @@ void EmitClangDiagsIndexName(llvm::RecordKeeper &Records,
void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangCommentHTMLTags(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTagsProperties(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentHTMLNamedCharacterReferences(llvm::RecordKeeper &Records,
- llvm::raw_ostream &OS);
+void EmitClangCommentHTMLNamedCharacterReferences(
+ const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangCommentCommandInfo(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentCommandList(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandList(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpcodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
@@ -142,16 +143,16 @@ void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOptDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltins(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltins(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinHeader(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinHeader(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinTests(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinTests(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangDataCollectors(llvm::RecordKeeper &Records,
+void EmitClangDataCollectors(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
void EmitTestPragmaAttributeSupportedAttributes(llvm::RecordKeeper &Records,
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index 0dd3977..ea2da23 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -14,7 +14,7 @@
#include "aarch64.h"
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64)
#error This file is intended only for aarch64-based targets
#endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index bf84a2f..666e6f3 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -151,10 +151,6 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
// This may happen inside the DTOR of main thread, so just ignore it.
tls_size = 0;
}
- if (tls_size) {
- CHECK_LE(tls_beg, reinterpret_cast<uptr>(res) - kDtvOffset);
- CHECK_LT(reinterpret_cast<uptr>(res) - kDtvOffset, tls_beg + tls_size);
- }
dtv->beg = tls_beg;
dtv->size = tls_size;
return dtv;
diff --git a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
index 857bec9..fd1ce0c 100644
--- a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
+++ b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
@@ -11,8 +11,8 @@
// RUN: %clangxx_asan -x c -DSO_NAME=f1 %s -shared -o %t-f1.so -fPIC
// RUN: %clangxx_asan -x c -DSO_NAME=f2 %s -shared -o %t-f2.so -fPIC
// RUN: %clangxx_asan %s -ldl -pthread -o %t
-// RUN: %env_asan_opts=verbosity=2 %run not %t 0 3
-// RUN: %env_asan_opts=verbosity=2 %run %t 2 3
+// RUN: %run %t 0 3
+// RUN: %run %t 2 3
// RUN: %env_asan_opts=verbosity=2 %run %t 10 2 2>&1 | FileCheck %s
// RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=1 %run %t 10 2 2>&1 | FileCheck %s
// RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=0 %run %t 10 2 2>&1 | FileCheck %s --check-prefix=CHECK0
@@ -29,7 +29,6 @@
// CHECK-NOT: num_live_dtls 5
//
// CHECK0-NOT: __tls_get_addr
-
/*
cc=your-compiler
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 10aef72..e94a49f 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -2744,7 +2744,6 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
(!procedure->IsElemental() && nonElemental)) {
int d{ComputeCudaMatchingDistance(
context_.languageFeatures(), *procedure, localActuals)};
- llvm::errs() << "matching distance: " << d << "\n";
if (d != crtMatchingDistance) {
if (d > crtMatchingDistance) {
continue;
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 45dfe3e..8643c9b 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -75,6 +75,10 @@ function(_get_compile_options_from_config output_var)
list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT")
endif()
+ if(LIBC_ADD_NULL_CHECKS)
+ list(APPEND config_options "-DLIBC_ADD_NULL_CHECKS")
+ endif()
+
set(${output_var} ${config_options} PARENT_SCOPE)
endfunction(_get_compile_options_from_config)
diff --git a/libc/config/config.json b/libc/config/config.json
index 2e72c0a..7dfbb56 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -94,5 +94,11 @@
"value": false,
"doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit."
}
+ },
+ "general": {
+ "LIBC_ADD_NULL_CHECKS": {
+ "value": true,
+ "doc": "Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior."
+ }
}
}
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 54ca5d5..86875d4 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -30,6 +30,8 @@ to learn about the defaults for your platform and target.
- ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
* **"errno" options**
- ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+* **"general" options**
+ - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
* **"math" options**
- ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
* **"printf" options**
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index a2fad9b..e0b65b7 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -143,4 +143,22 @@ add_proxy_header_library(
libc.include.llvm-libc-macros.limits_macros
)
+add_proxy_header_library(
+ link_macros
+ HDRS
+ link_macros.h
+ FULL_BUILD_DEPENDS
+ libc.include.llvm-libc-macros.link_macros
+ libc.include.link
+)
+
+add_proxy_header_library(
+ sys_auxv_macros
+ HDRS
+ sys_auxv_macros.h
+ FULL_BUILD_DEPENDS
+ libc.include.llvm-libc-macros.sys_auxv_macros
+ libc.include.sys_auxv
+)
+
add_subdirectory(types)
diff --git a/libc/hdr/link_macros.h b/libc/hdr/link_macros.h
new file mode 100644
index 0000000..8a78a86
--- /dev/null
+++ b/libc/hdr/link_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from link.h ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_LINK_MACROS_H
+#define LLVM_LIBC_HDR_LINK_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/link-macros.h"
+
+#else // Overlay mode
+
+#include <link.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_LINK_MACROS_H
diff --git a/libc/hdr/sys_auxv_macros.h b/libc/hdr/sys_auxv_macros.h
new file mode 100644
index 0000000..c04011b
--- /dev/null
+++ b/libc/hdr/sys_auxv_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from sys/auxv.h ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
+#define LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/sys-auxv-macros.h"
+
+#else // Overlay mode
+
+#include <sys/auxv.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
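
Both new proxy headers follow the same full-build/overlay switch. A sketch of a consumer, to show what the indirection buys (the has_vdso helper and its function-pointer parameter are illustrative, not part of the patch):

    // Including the proxy header instead of <sys/auxv.h> lets the same
    // source build in both modes: under LIBC_FULL_BUILD the AT_* macros
    // come from llvm-libc's own headers, otherwise from the system ones.
    #include "hdr/sys_auxv_macros.h"

    static bool has_vdso(unsigned long (*getauxval_fn)(unsigned long)) {
      // AT_SYSINFO_EHDR carries the base address of the vDSO ELF header.
      return getauxval_fn(AT_SYSINFO_EHDR) != 0;
    }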
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 9bd1e29..0302ad6 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -192,6 +192,9 @@ add_header_library(
libc.src.__support.CPP.optional
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.config
+ libc.src.__support.macros.null_check
+ libc.src.__support.macros.optimization
libc.src.errno.errno
)
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index 089cad4..6c70149 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -23,3 +23,33 @@ add_object_library(
libc.hdr.types.struct_f_owner_ex
libc.hdr.types.off_t
)
+
+add_header_library(
+ vdso_sym
+ HDRS
+ vdso_sym.h
+ DEPENDS
+ libc.src.__support.common
+)
+
+add_object_library(
+ vdso
+ HDRS
+ vdso.h
+ SRCS
+ vdso.cpp
+ DEPENDS
+ .${LIBC_TARGET_ARCHITECTURE}.vdso
+ libc.src.__support.CPP.array
+ libc.src.__support.CPP.optional
+ libc.src.__support.CPP.string_view
+ libc.src.__support.threads.callonce
+ libc.src.__support.threads.linux.futex_word_type
+ libc.hdr.types.struct_timeval
+ libc.hdr.types.struct_timespec
+ libc.hdr.types.clockid_t
+ libc.hdr.types.time_t
+ libc.hdr.link_macros
+ libc.src.errno.errno
+ libc.src.sys.auxv.getauxval
+)
diff --git a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
index eea9bad..d9451a1 100644
--- a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/aarch64/vdso.h b/libc/src/__support/OSUtil/linux/aarch64/vdso.h
new file mode 100644
index 0000000..3c4c620
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/aarch64/vdso.h
@@ -0,0 +1,37 @@
+//===---------- aarch64 vdso configuration ------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::RTSigReturn:
+ return "__kernel_rt_sigreturn";
+ case VDSOSym::GetTimeOfDay:
+ return "__kernel_gettimeofday";
+ case VDSOSym::ClockGetTime:
+ return "__kernel_clock_gettime";
+ case VDSOSym::ClockGetRes:
+ return "__kernel_clock_getres";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6.39";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
index 733366f..d991f7e 100644
--- a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/arm/vdso.h b/libc/src/__support/OSUtil/linux/arm/vdso.h
new file mode 100644
index 0000000..3de5860
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/arm/vdso.h
@@ -0,0 +1,37 @@
+//===---------- arm vdso configuration ----------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ case VDSOSym::ClockGetTime64:
+ return "__vdso_clock_gettime64";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
index e271204..eb93dd4 100644
--- a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/riscv/vdso.h b/libc/src/__support/OSUtil/linux/riscv/vdso.h
new file mode 100644
index 0000000..24ddb25
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/riscv/vdso.h
@@ -0,0 +1,43 @@
+//===---------- RISC-V vdso configuration -------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/riscv/kernel/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::RTSigReturn:
+ return "__vdso_rt_sigreturn";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ case VDSOSym::GetCpu:
+ return "__vdso_getcpu";
+ case VDSOSym::FlushICache:
+ return "__vdso_flush_icache";
+ case VDSOSym::RiscvHwProbe:
+ return "__vdso_riscv_hwprobe";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_4.15";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp
new file mode 100644
index 0000000..cb43764
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso.cpp
@@ -0,0 +1,237 @@
+//===------------- Linux VDSO Implementation --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/OSUtil/linux/vdso.h"
+#include "hdr/link_macros.h"
+#include "hdr/sys_auxv_macros.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/threads/callonce.h"
+#include "src/__support/threads/linux/futex_word.h"
+#include "src/errno/libc_errno.h"
+#include "src/sys/auxv/getauxval.h"
+#include <linux/auxvec.h>
+
+// TODO: This is a temporary workaround to avoid including elf.h
+// Include our own headers for ElfW and friends once we have them.
+namespace LIBC_NAMESPACE_DECL {
+
+namespace vdso {
+
+Symbol::VDSOArray Symbol::global_cache{};
+CallOnceFlag Symbol::once_flag = callonce_impl::NOT_CALLED;
+
+namespace {
+// See https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symverdefs.html
+struct Verdaux {
+ ElfW(Word) vda_name; /* Version or dependency names */
+ ElfW(Word) vda_next; /* Offset in bytes to next verdaux
+ entry */
+};
+struct Verdef {
+ ElfW(Half) vd_version; /* Version revision */
+ ElfW(Half) vd_flags; /* Version information */
+ ElfW(Half) vd_ndx; /* Version Index */
+ ElfW(Half) vd_cnt; /* Number of associated aux entries */
+ ElfW(Word) vd_hash; /* Version name hash value */
+ ElfW(Word) vd_aux; /* Offset in bytes to verdaux array */
+ ElfW(Word) vd_next; /* Offset in bytes to next verdef entry */
+ Verdef *next() const {
+ if (vd_next == 0)
+ return nullptr;
+ return reinterpret_cast<Verdef *>(reinterpret_cast<uintptr_t>(this) +
+ vd_next);
+ }
+ Verdaux *aux() const {
+ return reinterpret_cast<Verdaux *>(reinterpret_cast<uintptr_t>(this) +
+ vd_aux);
+ }
+};
+
+// version search procedure specified by
+// https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symversion.html#SYMVERTBL
+cpp::string_view find_version(Verdef *verdef, ElfW(Half) * versym,
+ const char *strtab, size_t idx) {
+ constexpr ElfW(Half) VER_FLG_BASE = 0x1;
+ if (!versym)
+ return "";
+ ElfW(Half) identifier = versym[idx] & 0x7FFF;
+ // iterate through all version definitions
+ for (Verdef *def = verdef; def != nullptr; def = def->next()) {
+ // skip if this is a file-level version
+ if (def->vd_flags & VER_FLG_BASE)
+ continue;
+    // Check whether the version identifier matches. The highest bit marks
+    // whether the symbol is local; only the lower 15 bits carry the
+    // version identifier.
+ if ((def->vd_ndx & 0x7FFF) == identifier) {
+ Verdaux *aux = def->aux();
+ return strtab + aux->vda_name;
+ }
+ }
+ return "";
+}
+
+size_t shdr_get_symbol_count(ElfW(Shdr) * vdso_shdr, size_t e_shnum) {
+ if (!vdso_shdr)
+ return 0;
+  // iterate over all sections until we locate the dynamic symbol section
+ for (size_t i = 0; i < e_shnum; ++i) {
+    // the dynamic symbol section is a table section, so the number of
+    // entries can be computed as the ratio of the section size to the
+    // size of a single entry
+ if (vdso_shdr[i].sh_type == SHT_DYNSYM)
+ return vdso_shdr[i].sh_size / vdso_shdr[i].sh_entsize;
+ }
+ return 0;
+}
+
+struct VDSOSymbolTable {
+ const char *strtab;
+ ElfW(Sym) * symtab;
+ // The following can be nullptr if the vDSO does not have versioning
+ ElfW(Half) * versym;
+ Verdef *verdef;
+
+ void populate_symbol_cache(Symbol::VDSOArray &symbol_table,
+ size_t symbol_count, ElfW(Addr) vdso_addr) {
+ for (size_t i = 0, e = symbol_table.size(); i < e; ++i) {
+ Symbol sym = i;
+ cpp::string_view name = sym.name();
+ cpp::string_view version = sym.version();
+ if (name.empty())
+ continue;
+
+ for (size_t j = 0; j < symbol_count; ++j) {
+ if (name == strtab + symtab[j].st_name) {
+          // we found a symbol with the desired name;
+          // now check whether it also carries the right version
+ if (versym && verdef &&
+ version != find_version(verdef, versym, strtab, j))
+ continue;
+
+ // put the symbol address into the symbol table
+ symbol_table[i] =
+ reinterpret_cast<void *>(vdso_addr + symtab[j].st_value);
+ }
+ }
+ }
+ }
+};
+
+struct PhdrInfo {
+ ElfW(Addr) vdso_addr;
+ ElfW(Dyn) * vdso_dyn;
+ static cpp::optional<PhdrInfo> from(ElfW(Phdr) * vdso_phdr, size_t e_phnum,
+ uintptr_t vdso_ehdr_addr) {
+ constexpr ElfW(Addr) INVALID_ADDR = static_cast<ElfW(Addr)>(-1);
+ ElfW(Addr) vdso_addr = INVALID_ADDR;
+ ElfW(Dyn) *vdso_dyn = nullptr;
+ if (!vdso_phdr)
+ return cpp::nullopt;
+ // iterate through all the program headers until we get the desired pieces
+ for (size_t i = 0; i < e_phnum; ++i) {
+ if (vdso_phdr[i].p_type == PT_DYNAMIC)
+ vdso_dyn = reinterpret_cast<ElfW(Dyn) *>(vdso_ehdr_addr +
+ vdso_phdr[i].p_offset);
+
+ if (vdso_phdr[i].p_type == PT_LOAD)
+ vdso_addr =
+ vdso_ehdr_addr + vdso_phdr[i].p_offset - vdso_phdr[i].p_vaddr;
+
+      if (vdso_addr != INVALID_ADDR && vdso_dyn != nullptr)
+ return PhdrInfo{vdso_addr, vdso_dyn};
+ }
+
+ return cpp::nullopt;
+ }
+
+ cpp::optional<VDSOSymbolTable> populate_symbol_table() {
+ const char *strtab = nullptr;
+ ElfW(Sym) *symtab = nullptr;
+ ElfW(Half) *versym = nullptr;
+ Verdef *verdef = nullptr;
+ for (ElfW(Dyn) *d = vdso_dyn; d->d_tag != DT_NULL; ++d) {
+ switch (d->d_tag) {
+ case DT_STRTAB:
+ strtab = reinterpret_cast<const char *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_SYMTAB:
+ symtab = reinterpret_cast<ElfW(Sym) *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_VERSYM:
+ versym = reinterpret_cast<uint16_t *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_VERDEF:
+ verdef = reinterpret_cast<Verdef *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ }
+ if (strtab && symtab && versym && verdef)
+ break;
+ }
+ if (strtab == nullptr || symtab == nullptr)
+ return cpp::nullopt;
+
+ return VDSOSymbolTable{strtab, symtab, versym, verdef};
+ }
+};
+} // namespace
+
+void Symbol::initialize_vdso_global_cache() {
+ // first clear the symbol table
+ for (auto &i : global_cache)
+ i = nullptr;
+
+  // Get the address of the vDSO; protect errno, since getauxval may
+  // change it.
+ int errno_backup = libc_errno;
+ uintptr_t vdso_ehdr_addr = getauxval(AT_SYSINFO_EHDR);
+ // Get the memory address of the vDSO ELF header.
+ auto vdso_ehdr = reinterpret_cast<ElfW(Ehdr) *>(vdso_ehdr_addr);
+ // leave the table unpopulated if we don't have vDSO
+ if (vdso_ehdr == nullptr) {
+ libc_errno = errno_backup;
+ return;
+ }
+
+  // locate the section header table inside the ELF using the section
+  // header offset
+ auto vdso_shdr =
+ reinterpret_cast<ElfW(Shdr) *>(vdso_ehdr_addr + vdso_ehdr->e_shoff);
+ size_t symbol_count = shdr_get_symbol_count(vdso_shdr, vdso_ehdr->e_shnum);
+
+ // early return if no symbol is found
+ if (symbol_count == 0)
+ return;
+
+  // We need to find both the loadable segment and the dynamic linking
+  // section of the vDSO. Compute vdso_phdr as the program header table
+  // address using the program header offset.
+ ElfW(Phdr) *vdso_phdr =
+ reinterpret_cast<ElfW(Phdr) *>(vdso_ehdr_addr + vdso_ehdr->e_phoff);
+ cpp::optional<PhdrInfo> phdr_info =
+ PhdrInfo::from(vdso_phdr, vdso_ehdr->e_phnum, vdso_ehdr_addr);
+ // early return if either the dynamic linking or the loadable segment is
+ // not found
+ if (!phdr_info.has_value())
+ return;
+
+  // now, locate several more tables inside the dynamic linking section
+ cpp::optional<VDSOSymbolTable> vdso_symbol_table =
+ phdr_info->populate_symbol_table();
+
+ // early return if we can't find any required fields of the symbol table
+ if (!vdso_symbol_table.has_value())
+ return;
+
+ // finally, populate the global symbol table cache
+ vdso_symbol_table->populate_symbol_cache(global_cache, symbol_count,
+ phdr_info->vdso_addr);
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/OSUtil/linux/vdso.h b/libc/src/__support/OSUtil/linux/vdso.h
new file mode 100644
index 0000000..a5108b3
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso.h
@@ -0,0 +1,81 @@
+//===------------- Linux VDSO Header ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
+#include "src/__support/CPP/array.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/threads/callonce.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_X86)
+#include "x86_64/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#include "aarch64/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
+#include "arm/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
+#include "riscv/vdso.h"
+#else
+#error "unknown arch"
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+
+class Symbol {
+ VDSOSym sym;
+
+public:
+ LIBC_INLINE_VAR static constexpr size_t COUNT =
+ static_cast<size_t>(VDSOSym::VDSOSymCount);
+ LIBC_INLINE constexpr explicit Symbol(VDSOSym sym) : sym(sym) {}
+ LIBC_INLINE constexpr Symbol(size_t idx) : sym(static_cast<VDSOSym>(idx)) {}
+ LIBC_INLINE constexpr cpp::string_view name() const {
+ return symbol_name(sym);
+ }
+ LIBC_INLINE constexpr cpp::string_view version() const {
+ return symbol_version(sym);
+ }
+ LIBC_INLINE constexpr operator size_t() const {
+ return static_cast<size_t>(sym);
+ }
+ LIBC_INLINE constexpr bool is_valid() const {
+ return *this < Symbol::global_cache.size();
+ }
+ using VDSOArray = cpp::array<void *, Symbol::COUNT>;
+
+private:
+ static CallOnceFlag once_flag;
+ static VDSOArray global_cache;
+ static void initialize_vdso_global_cache();
+
+ LIBC_INLINE void *get() const {
+ if (name().empty() || !is_valid())
+ return nullptr;
+
+ callonce(&once_flag, Symbol::initialize_vdso_global_cache);
+ return (global_cache[*this]);
+ }
+ template <VDSOSym sym> friend struct TypedSymbol;
+};
+
+template <VDSOSym sym> struct TypedSymbol {
+ LIBC_INLINE constexpr operator VDSOSymType<sym>() const {
+ return cpp::bit_cast<VDSOSymType<sym>>(Symbol{sym}.get());
+ }
+ template <typename... Args>
+ LIBC_INLINE auto operator()(Args &&...args) const {
+ return this->operator VDSOSymType<sym>()(cpp::forward<Args>(args)...);
+ }
+};
+
+} // namespace vdso
+
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
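
The TypedSymbol wrapper above is exercised by the unit tests near the end of this patch; as a quick orientation, a sketch of the intended call pattern (the fallback handling here is illustrative):

    #include "hdr/types/clockid_t.h"
    #include "hdr/types/struct_timespec.h"
    #include "src/__support/OSUtil/linux/vdso.h"

    namespace LIBC_NAMESPACE_DECL {
    // TypedSymbol converts to the typed function pointer for the symbol;
    // the pointer is null when the running kernel's vDSO does not export
    // it, so callers can test it before calling through.
    int clock_gettime_via_vdso(clockid_t id, timespec *ts) {
      vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime> vdso_clock_gettime;
      if (!vdso_clock_gettime)
        return -1; // a real caller would fall back to the raw syscall
      return vdso_clock_gettime(id, ts);
    }
    } // namespace LIBC_NAMESPACE_DECL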
diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h
new file mode 100644
index 0000000..eb5f204
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso_sym.h
@@ -0,0 +1,70 @@
+//===------------- Linux VDSO Symbols ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "hdr/types/clock_t.h"
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/struct_timeval.h"
+#include "hdr/types/time_t.h"
+#include "src/__support/common.h"
+#include <stddef.h> // For size_t.
+
+// NOLINTBEGIN(llvmlibc-implementation-in-namespace)
+// TODO: some of the following can be defined via proxy headers.
+struct __kernel_timespec;
+struct timezone;
+struct riscv_hwprobe;
+struct getcpu_cache;
+struct cpu_set_t;
+// NOLINTEND(llvmlibc-implementation-in-namespace)
+
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+
+enum class VDSOSym {
+ ClockGetTime,
+ ClockGetTime64,
+ GetTimeOfDay,
+ GetCpu,
+ Time,
+ ClockGetRes,
+ RTSigReturn,
+ FlushICache,
+ RiscvHwProbe,
+ VDSOSymCount
+};
+
+template <VDSOSym sym> LIBC_INLINE constexpr auto dispatcher() {
+ if constexpr (sym == VDSOSym::ClockGetTime)
+ return static_cast<int (*)(clockid_t, timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::ClockGetTime64)
+ return static_cast<int (*)(clockid_t, __kernel_timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::GetTimeOfDay)
+ return static_cast<int (*)(timeval *__restrict, timezone *__restrict)>(
+ nullptr);
+ else if constexpr (sym == VDSOSym::GetCpu)
+ return static_cast<int (*)(unsigned *, unsigned *, getcpu_cache *)>(
+ nullptr);
+ else if constexpr (sym == VDSOSym::Time)
+ return static_cast<time_t (*)(time_t *)>(nullptr);
+ else if constexpr (sym == VDSOSym::ClockGetRes)
+ return static_cast<int (*)(clockid_t, timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::RTSigReturn)
+ return static_cast<void (*)(void)>(nullptr);
+ else if constexpr (sym == VDSOSym::FlushICache)
+ return static_cast<void (*)(void *, void *, unsigned int)>(nullptr);
+ else if constexpr (sym == VDSOSym::RiscvHwProbe)
+ return static_cast<int (*)(riscv_hwprobe *, size_t, size_t, cpu_set_t *,
+ unsigned)>(nullptr);
+ else
+ return static_cast<void *>(nullptr);
+}
+
+template <VDSOSym sym> using VDSOSymType = decltype(dispatcher<sym>());
+
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
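
The dispatcher function above is never called; it exists only so decltype can recover a typed function pointer per enumerator. Spelled out for one symbol as a compile-time check (assuming cpp::is_same_v from src/__support/CPP/type_traits.h):

    #include "src/__support/CPP/type_traits.h"
    #include "src/__support/OSUtil/linux/vdso_sym.h"

    namespace LIBC_NAMESPACE_DECL {
    namespace vdso {
    // VDSOSymType<sym> is the return type of dispatcher<sym>(), so for
    // ClockGetTime it is exactly int (*)(clockid_t, timespec *).
    static_assert(cpp::is_same_v<VDSOSymType<VDSOSym::ClockGetTime>,
                                 int (*)(clockid_t, timespec *)>,
                  "dispatcher pins down the callable type of each symbol");
    } // namespace vdso
    } // namespace LIBC_NAMESPACE_DECL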
diff --git a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
index a7f2d74e6..1324491 100644
--- a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/x86_64/vdso.h b/libc/src/__support/OSUtil/linux/x86_64/vdso.h
new file mode 100644
index 0000000..abe7c33
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/x86_64/vdso.h
@@ -0,0 +1,43 @@
+//===---------- x86/x86_64 vdso configuration ---------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// On x86, some symbols are also defined without the __vdso_ prefix, but
+// using the __vdso_ prefix is the suggested practice.
+// There is also an __vdso_sgx_enter_enclave for SGX support; we do not
+// include it here for now.
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/entry/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::GetCpu:
+ return "__vdso_getcpu";
+ case VDSOSym::Time:
+ return "__vdso_time";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
diff --git a/libc/src/__support/macros/CMakeLists.txt b/libc/src/__support/macros/CMakeLists.txt
index bcd47c3..99d4f64 100644
--- a/libc/src/__support/macros/CMakeLists.txt
+++ b/libc/src/__support/macros/CMakeLists.txt
@@ -27,3 +27,13 @@ add_header_library(
DEPENDS
libc.src.__support.macros.properties.compiler
)
+
+add_header_library(
+ null_check
+ HDRS
+ null_check.h
+ DEPENDS
+ .config
+ .optimization
+ .sanitizer
+)
diff --git a/libc/src/__support/macros/null_check.h b/libc/src/__support/macros/null_check.h
new file mode 100644
index 0000000..400f7d8
--- /dev/null
+++ b/libc/src/__support/macros/null_check.h
@@ -0,0 +1,33 @@
+//===-- Safe nullptr check --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
+#define LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/sanitizer.h"
+
+#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER)
+// Use a volatile access so the compiler cannot optimize away the nullptr
+// dereference; the intent is to crash deterministically with SIGSEGV.
+#define LIBC_CRASH_ON_NULLPTR(PTR) \
+ do { \
+ if (LIBC_UNLIKELY(PTR == nullptr)) { \
+ volatile auto *crashing = PTR; \
+ [[maybe_unused]] volatile auto crash = *crashing; \
+ __builtin_trap(); \
+ } \
+ } while (0)
+#else
+#define LIBC_CRASH_ON_NULLPTR(PTR)                                            \
+ do { \
+ } while (0)
+#endif
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
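
strtonan in str_to_float.h below is the first adopter of the macro; as a self-contained illustration of its contract (the function here is hypothetical, not part of the patch):

    #include "src/__support/macros/null_check.h"

    // With LIBC_ADD_NULL_CHECKS enabled and no sanitizer active, a nullptr
    // argument faults immediately at the function boundary instead of at
    // some arbitrary point deep inside the implementation.
    static int checked_strlen(const char *s) {
      LIBC_CRASH_ON_NULLPTR(s);
      int n = 0;
      while (s[n] != '\0')
        ++n;
      return n;
    }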
diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h
index c4f8b5b..c20412e 100644
--- a/libc/src/__support/macros/sanitizer.h
+++ b/libc/src/__support/macros/sanitizer.h
@@ -15,7 +15,25 @@
// Functions to unpoison memory
//-----------------------------------------------------------------------------
+#if LIBC_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBC_HAS_ADDRESS_SANITIZER
+#endif
+
#if LIBC_HAS_FEATURE(memory_sanitizer)
+#define LIBC_HAS_MEMORY_SANITIZER
+#endif
+
+#if LIBC_HAS_FEATURE(undefined_behavior_sanitizer)
+#define LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER
+#endif
+
+#if defined(LIBC_HAS_ADDRESS_SANITIZER) || \
+ defined(LIBC_HAS_MEMORY_SANITIZER) || \
+ defined(LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER)
+#define LIBC_HAS_SANITIZER
+#endif
+
+#ifdef LIBC_HAS_MEMORY_SANITIZER
// Only perform MSAN unpoison in non-constexpr context.
#include <sanitizer/msan_interface.h>
#define MSAN_UNPOISON(addr, size) \
@@ -27,8 +45,7 @@
#define MSAN_UNPOISON(ptr, size)
#endif
-#if LIBC_HAS_FEATURE(address_sanitizer)
-#define LIBC_HAVE_ADDRESS_SANITIZER
+#ifdef LIBC_HAS_ADDRESS_SANITIZER
#include <sanitizer/asan_interface.h>
#define ASAN_POISON_MEMORY_REGION(addr, size) \
__asan_poison_memory_region((addr), (size))
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index ffd6ebf..a452b3a 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -20,6 +20,8 @@
#include "src/__support/detailed_powers_of_ten.h"
#include "src/__support/high_precision_decimal.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "src/__support/macros/optimization.h"
#include "src/__support/str_to_integer.h"
#include "src/__support/str_to_num_result.h"
#include "src/__support/uint128.h"
@@ -1208,6 +1210,8 @@ template <class T> LIBC_INLINE StrToNumResult<T> strtonan(const char *arg) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
+ LIBC_CRASH_ON_NULLPTR(arg);
+
FPBits result;
int error = 0;
StorageType nan_mantissa = 0;
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index c8a15dd..799aad1 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -16,6 +16,7 @@ namespace LIBC_NAMESPACE_DECL {
constinit ExitCallbackList atexit_callbacks;
Mutex handler_list_mtx(false, false, false, false);
+[[gnu::weak]] extern void teardown_main_tls();
extern "C" {
@@ -24,8 +25,11 @@ int __cxa_atexit(AtExitCallback *callback, void *payload, void *) {
}
void __cxa_finalize(void *dso) {
- if (!dso)
+ if (!dso) {
call_exit_callbacks(atexit_callbacks);
+ if (teardown_main_tls)
+ teardown_main_tls();
+ }
}
} // extern "C"
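
The guard added to __cxa_finalize (and to quick_exit below) relies on weak linkage. The pattern in isolation (the wrapper function is illustrative):

    // A weak declaration may remain undefined at link time; the symbol's
    // address is then null, so `if (fn)` doubles as a "was a definition
    // linked in?" test before the call.
    [[gnu::weak]] extern void teardown_main_tls();

    static void run_teardown_if_linked() { // hypothetical wrapper
      if (teardown_main_tls)
        teardown_main_tls();
    }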
diff --git a/libc/src/stdlib/quick_exit.cpp b/libc/src/stdlib/quick_exit.cpp
index a5abf3e..29110b3 100644
--- a/libc/src/stdlib/quick_exit.cpp
+++ b/libc/src/stdlib/quick_exit.cpp
@@ -16,9 +16,12 @@
namespace LIBC_NAMESPACE_DECL {
extern ExitCallbackList at_quick_exit_callbacks;
+[[gnu::weak]] extern void teardown_main_tls();
[[noreturn]] LLVM_LIBC_FUNCTION(void, quick_exit, (int status)) {
call_exit_callbacks(at_quick_exit_callbacks);
+ if (teardown_main_tls)
+ teardown_main_tls();
internal::exit(status);
}
diff --git a/libc/src/sys/auxv/getauxval.h b/libc/src/sys/auxv/getauxval.h
index 3e69713..d9da45ff 100644
--- a/libc/src/sys/auxv/getauxval.h
+++ b/libc/src/sys/auxv/getauxval.h
@@ -9,8 +9,8 @@
#ifndef LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
#define LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
+#include "hdr/sys_auxv_macros.h"
#include "src/__support/macros/config.h"
-#include <sys/auxv.h>
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index f09541b..8bd0c3a 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -17,6 +17,9 @@ extern "C" int main(int argc, char **argv, char **envp);
namespace LIBC_NAMESPACE_DECL {
+// FIXME: Factor this out into common logic so we don't need to stub it here.
+void teardown_main_tls() {}
+
DataEnvironment app;
extern "C" uintptr_t __init_array_start[];
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index ef1e63e..bc529b3 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -19,6 +19,9 @@ namespace LIBC_NAMESPACE_DECL {
DataEnvironment app;
+// FIXME: Factor this out into common logic so we don't need to stub it here.
+void teardown_main_tls() {}
+
extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp
index 72060b4..ff104c7 100644
--- a/libc/startup/linux/do_start.cpp
+++ b/libc/startup/linux/do_start.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
#include "startup/linux/do_start.h"
+#include "config/linux/app.h"
#include "include/llvm-libc-macros/link-macros.h"
#include "src/__support/OSUtil/syscall.h"
#include "src/__support/macros/config.h"
@@ -60,6 +61,10 @@ static void call_fini_array_callbacks() {
}
static ThreadAttributes main_thread_attrib;
+static TLSDescriptor tls;
+// We keep teardown_main_tls separate from the atexit callbacks, as the
+// callback functions themselves may require TLS.
+void teardown_main_tls() { cleanup_tls(tls.addr, tls.size); }
[[noreturn]] void do_start() {
auto tid = syscall_impl<long>(SYS_gettid);
@@ -122,7 +127,6 @@ static ThreadAttributes main_thread_attrib;
// This descriptor has to be static since its cleanup function cannot
// capture the context.
- static TLSDescriptor tls;
init_tls(tls);
if (tls.size != 0 && !set_thread_ptr(tls.tp))
syscall_impl<long>(SYS_exit, 1);
@@ -130,10 +134,7 @@ static ThreadAttributes main_thread_attrib;
self.attrib = &main_thread_attrib;
main_thread_attrib.atexit_callback_mgr =
internal::get_thread_atexit_callback_mgr();
- // We register the cleanup_tls function to be the last atexit callback to be
- // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such
- // as the stack protector canary).
- atexit([]() { cleanup_tls(tls.addr, tls.size); });
+
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
diff --git a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
index bfb072c..ff82616 100644
--- a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
@@ -1,3 +1,21 @@
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
endif()
+
+add_libc_test(
+ vdso_test
+ SUITE libc-osutil-tests
+ SRCS vdso_test.cpp
+ DEPENDS
+ libc.src.__support.OSUtil.linux.vdso
+ libc.src.__support.OSUtil.osutil
+ libc.hdr.types.struct_sigaction
+ libc.hdr.types.struct_timeval
+ libc.hdr.types.struct_timespec
+ libc.hdr.types.clockid_t
+ libc.hdr.types.time_t
+ libc.hdr.time_macros
+ libc.hdr.signal_macros
+ libc.src.signal.sigaction
+ libc.src.signal.raise
+)
diff --git a/libc/test/src/__support/OSUtil/linux/vdso_test.cpp b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp
new file mode 100644
index 0000000..2363db6
--- /dev/null
+++ b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp
@@ -0,0 +1,162 @@
+//===-- Unittests for VDSO ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/signal_macros.h"
+#include "hdr/time_macros.h"
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_sigaction.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/struct_timeval.h"
+#include "hdr/types/time_t.h"
+#include "src/__support/OSUtil/linux/vdso.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/signal/raise.h"
+#include "src/signal/sigaction.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/LibcTest.h"
+#include "test/UnitTest/Test.h"
+#include <linux/time_types.h>
+#include <sys/syscall.h>
+
+struct riscv_hwprobe {
+ int64_t key;
+ uint64_t value;
+};
+
+namespace LIBC_NAMESPACE_DECL {
+// For x86_64, we explicitly test that some traditional vdso symbols are
+// indeed available.
+
+TEST(LlvmLibcOSUtilVDSOTest, GetTimeOfDay) {
+ vdso::TypedSymbol<vdso::VDSOSym::GetTimeOfDay> symbol;
+ if (!symbol)
+ return;
+ timeval tv;
+ EXPECT_EQ(symbol(&tv, nullptr), 0);
+ // Hopefully people are not building time machines using our libc.
+ EXPECT_GT(tv.tv_sec, static_cast<decltype(tv.tv_sec)>(0));
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, Time) {
+ vdso::TypedSymbol<vdso::VDSOSym::Time> symbol;
+ if (!symbol)
+ return;
+ time_t a, b;
+ EXPECT_GT(symbol(&a), static_cast<time_t>(0));
+ EXPECT_GT(symbol(&b), static_cast<time_t>(0));
+ EXPECT_GE(b, a);
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime> symbol;
+ if (!symbol)
+ return;
+ timespec a, b;
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0);
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0);
+ if (a.tv_sec == b.tv_sec) {
+ EXPECT_LT(a.tv_nsec, b.tv_nsec);
+ } else {
+ EXPECT_LT(a.tv_sec, b.tv_sec);
+ }
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime64) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime64> symbol;
+ if (!symbol)
+ return;
+ // See kernel API at
+ // https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/vDSO/vdso_test_correctness.c#L155
+ __kernel_timespec a, b;
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0);
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0);
+ if (a.tv_sec == b.tv_sec) {
+ EXPECT_LT(a.tv_nsec, b.tv_nsec);
+ } else {
+ EXPECT_LT(a.tv_sec, b.tv_sec);
+ }
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetRes) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetRes> symbol;
+ if (!symbol)
+ return;
+ timespec res{};
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &res), 0);
+ EXPECT_TRUE(res.tv_sec > 0 || res.tv_nsec > 0);
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, GetCpu) {
+ // The kernel system call has a third argument, which should be passed as
+ // nullptr.
+ vdso::TypedSymbol<vdso::VDSOSym::GetCpu> symbol;
+ if (!symbol)
+ return;
+ unsigned cpu = static_cast<unsigned>(-1), node = static_cast<unsigned>(-1);
+ EXPECT_EQ(symbol(&cpu, &node, nullptr), 0);
+ EXPECT_GE(cpu, 0u);
+ EXPECT_GE(node, 0u);
+}
+
+static bool flag = false;
+static void sigprof_handler [[gnu::used]] (int) { flag = true; }
+
+TEST(LlvmLibcOSUtilVDSOTest, RtSigReturn) {
+ using namespace testing::ErrnoSetterMatcher;
+ // Must use struct since there is a function of the same name in the same
+ // scope.
+ struct sigaction sa {};
+ struct sigaction old_sa {};
+ sa.sa_handler = sigprof_handler;
+ sa.sa_flags = SA_RESTORER;
+ vdso::TypedSymbol<vdso::VDSOSym::RTSigReturn> symbol;
+ if (!symbol)
+ return;
+ sa.sa_restorer = symbol;
+ ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &sa, &old_sa), Succeeds());
+ raise(SIGPROF);
+ ASSERT_TRUE(flag);
+ flag = false;
+ ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &old_sa, nullptr), Succeeds());
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, FlushICache) {
+ vdso::TypedSymbol<vdso::VDSOSym::FlushICache> symbol;
+ if (!symbol)
+ return;
+ char buf[512];
+ // We just check that the flush does not crash the program.
+ // As of kernel 6.10, the flags argument only takes 0 or 1, indicating
+ // whether the flush is local to the core or global.
+ symbol(buf, buf + sizeof(buf), 0);
+ symbol(buf, buf + sizeof(buf), 1);
+}
+
+// https://docs.kernel.org/6.5/riscv/hwprobe.html
+TEST(LlvmLibcOSUtilVDSOTest, RiscvHwProbe) {
+ using namespace testing::ErrnoSetterMatcher;
+ vdso::TypedSymbol<vdso::VDSOSym::RiscvHwProbe> symbol;
+ if (!symbol)
+ return;
+ // If a key is unknown to the kernel, its key field is cleared to -1 and its
+ // value is set to 0, so we expect every probe's value to be 0.
+ // Usermode can supply NULL for cpus and 0 for cpu_count as a shortcut for all
+ // online CPUs.
+ riscv_hwprobe probes[2] = {{-1, 1}, {-1, 1}};
+ ASSERT_THAT(symbol(/*pairs=*/probes, /*count=*/2, /*cpusetsize=*/0,
+ /*cpuset=*/nullptr,
+ /*flags=*/0),
+ Succeeds());
+ for (auto &probe : probes) {
+ EXPECT_EQ(probe.key, static_cast<decltype(probe.key)>(-1));
+ EXPECT_EQ(probe.value, static_cast<decltype(probe.value)>(0));
+ }
+}
+
+} // namespace LIBC_NAMESPACE_DECL
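Every test above follows the same pattern: construct a vdso::TypedSymbol,
return early if the kernel does not export that symbol, and otherwise invoke
it like a plain function. Inferred from that usage alone (the real definition
lives in src/__support/OSUtil/linux/vdso.h and is not shown in this hunk), the
interface is roughly:

    namespace vdso {
    template <VDSOSym Sym> struct TypedSymbol {
      explicit operator bool() const; // false if the vDSO lacks the symbol
      template <typename... Ts>
      auto operator()(Ts... args) const; // forward the call into the vDSO
    };
    } // namespace vdso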
diff --git a/libc/test/src/compiler/CMakeLists.txt b/libc/test/src/compiler/CMakeLists.txt
index 65a9acc..a45fa8c 100644
--- a/libc/test/src/compiler/CMakeLists.txt
+++ b/libc/test/src/compiler/CMakeLists.txt
@@ -7,6 +7,7 @@ add_libc_unittest(
SRCS
stack_chk_guard_test.cpp
DEPENDS
+ libc.hdr.signal_macros
libc.src.__support.macros.sanitizer
libc.src.compiler.__stack_chk_fail
libc.src.string.memset
diff --git a/libc/test/src/compiler/stack_chk_guard_test.cpp b/libc/test/src/compiler/stack_chk_guard_test.cpp
index 6b71e15..4ec8398 100644
--- a/libc/test/src/compiler/stack_chk_guard_test.cpp
+++ b/libc/test/src/compiler/stack_chk_guard_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-macros/signal-macros.h"
+#include "hdr/signal_macros.h"
#include "src/__support/macros/sanitizer.h"
#include "src/compiler/__stack_chk_fail.h"
#include "src/string/memset.h"
@@ -18,7 +18,7 @@ TEST(LlvmLibcStackChkFail, Death) {
// Disable the test when asan is enabled so that it doesn't immediately fail
// after the memset, but before the stack canary is re-checked.
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#ifndef LIBC_HAS_ADDRESS_SANITIZER
TEST(LlvmLibcStackChkFail, Smash) {
EXPECT_DEATH(
[] {
@@ -27,4 +27,4 @@ TEST(LlvmLibcStackChkFail, Smash) {
},
WITH_SIGNAL(SIGABRT));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 7271e93..e943d98 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -2895,9 +2895,10 @@ add_fp_unittest(
SRCS
nanf_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2910,9 +2911,10 @@ add_fp_unittest(
SRCS
nan_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nan
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2925,9 +2927,10 @@ add_fp_unittest(
SRCS
nanl_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanl
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2940,7 +2943,7 @@ add_fp_unittest(
SRCS
nanf16_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf16
libc.src.__support.FPUtil.fp_bits
libc.src.__support.macros.sanitizer
@@ -2956,9 +2959,10 @@ add_fp_unittest(
SRCS
nanf128_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf128
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp
index 68c8441..46b9e9a 100644
--- a/libc/test/src/math/smoke/nan_test.cpp
+++ b/libc/test/src/math/smoke/nan_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nan.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
class LlvmLibcNanTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
@@ -43,8 +44,8 @@ TEST_F(LlvmLibcNanTest, RandomString) {
run_test("123 ", 0x7ff8000000000000);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp
index 015cc31..25dd2ef 100644
--- a/libc/test/src/math/smoke/nanf128_test.cpp
+++ b/libc/test/src/math/smoke/nanf128_test.cpp
@@ -6,7 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/__support/uint128.h"
#include "src/math/nanf128.h"
#include "test/UnitTest/FEnvSafeTest.h"
@@ -53,9 +55,8 @@ TEST_F(LlvmLibcNanf128Test, RandomString) {
QUIET_NAN);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
-#include <signal.h>
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanf128Test, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp
index 81b844b..ec640a3 100644
--- a/libc/test/src/math/smoke/nanf16_test.cpp
+++ b/libc/test/src/math/smoke/nanf16_test.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/sanitizer.h"
#include "src/math/nanf16.h"
@@ -13,8 +14,6 @@
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
-
class LlvmLibcNanf16Test : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
using StorageType = LIBC_NAMESPACE::fputil::FPBits<float16>::StorageType;
@@ -44,8 +43,8 @@ TEST_F(LlvmLibcNanf16Test, RandomString) {
run_test("123 ", 0x7e00);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanf16Test, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf_test.cpp b/libc/test/src/math/smoke/nanf_test.cpp
index ff58236..dd3124e 100644
--- a/libc/test/src/math/smoke/nanf_test.cpp
+++ b/libc/test/src/math/smoke/nanf_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nanf.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
class LlvmLibcNanfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
@@ -42,8 +43,8 @@ TEST_F(LlvmLibcNanfTest, RandomString) {
run_test("123 ", 0x7fc00000);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanfTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp
index de9af05..ef3f9c1 100644
--- a/libc/test/src/math/smoke/nanl_test.cpp
+++ b/libc/test/src/math/smoke/nanl_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nanl.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
#define SELECT_LONG_DOUBLE(val, _, __) val
@@ -70,8 +71,8 @@ TEST_F(LlvmLibcNanlTest, RandomString) {
run_test("123 ", expected);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanlTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a571832..ffff811 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -925,7 +925,6 @@ set(files
exception
execution
expected
- experimental/__config
experimental/__simd/aligned_tag.h
experimental/__simd/declaration.h
experimental/__simd/reference.h
diff --git a/libcxx/include/__config b/libcxx/include/__config
index b0a5dda..f0a9243 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -591,6 +591,15 @@ typedef __char32_t char32_t;
inline namespace _LIBCPP_ABI_NAMESPACE {
# define _LIBCPP_END_NAMESPACE_STD }} _LIBCPP_POP_EXTENSION_DIAGNOSTICS
+#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL namespace std { namespace experimental {
+#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL }}
+
+#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 {
+#define _LIBCPP_END_NAMESPACE_LFTS } _LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
+#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 {
+#define _LIBCPP_END_NAMESPACE_LFTS_V2 } _LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
#ifdef _LIBCPP_ABI_NO_FILESYSTEM_INLINE_NAMESPACE
# define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_STD namespace filesystem {
# define _LIBCPP_END_NAMESPACE_FILESYSTEM } _LIBCPP_END_NAMESPACE_STD
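Spelled out, the new macros reproduce the nested namespaces that the deleted
experimental/__config header used to provide. For instance:

    _LIBCPP_BEGIN_NAMESPACE_LFTS   // namespace std { namespace experimental {
                                   //   inline namespace fundamentals_v1 {
    // ... LFTS declarations ...
    _LIBCPP_END_NAMESPACE_LFTS     // } } }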
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 86d9f28..5980b07 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -19,16 +19,20 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
-#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/serial.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/std_thread.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/libdispatch.h>
-#endif
+#if _LIBCPP_STD_VER >= 17
+
+# if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/serial.h>
+# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/std_thread.h>
+# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/libdispatch.h>
+# endif
+
+#endif // _LIBCPP_STD_VER >= 17
_LIBCPP_POP_MACROS
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 32c5da5..2132e8d 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -39,6 +39,8 @@ _LIBCPP_PUSH_MACROS
// the user.
//
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -50,18 +52,18 @@ struct __libdispatch_backend_tag;
struct __serial_backend_tag;
struct __std_thread_backend_tag;
-#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
+# if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
-#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
+# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
-#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
-#else
+# else
// ...New vendors can add parallel backends here...
-# error "Invalid PSTL backend configuration"
-#endif
+# error "Invalid PSTL backend configuration"
+# endif
template <class _Backend, class _ExecutionPolicy>
struct __find_if;
@@ -296,6 +298,8 @@ struct __reduce;
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKEND_FWD_H
diff --git a/libcxx/include/__pstl/backends/default.h b/libcxx/include/__pstl/backends/default.h
index b655da5..3672bbf 100644
--- a/libcxx/include/__pstl/backends/default.h
+++ b/libcxx/include/__pstl/backends/default.h
@@ -33,6 +33,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -498,6 +500,8 @@ struct __rotate_copy<__default_backend_tag, _ExecutionPolicy> {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_DEFAULT_H
diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h
index a0c3ad9..2d6ab49 100644
--- a/libcxx/include/__pstl/backends/libdispatch.h
+++ b/libcxx/include/__pstl/backends/libdispatch.h
@@ -44,6 +44,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -140,15 +142,15 @@ struct __cpu_traits<__libdispatch_backend_tag> {
unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
[&]() -> __merge_range_t* {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
try {
-#endif
+# endif
return std::allocator<__merge_range_t>().allocate(__n_ranges);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
} catch (const std::bad_alloc&) {
return nullptr;
}
-#endif
+# endif
}(),
__destroy);
@@ -392,6 +394,8 @@ struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
diff --git a/libcxx/include/__pstl/backends/serial.h b/libcxx/include/__pstl/backends/serial.h
index 5f24499..f414201 100644
--- a/libcxx/include/__pstl/backends/serial.h
+++ b/libcxx/include/__pstl/backends/serial.h
@@ -30,6 +30,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -176,6 +178,8 @@ struct __transform_reduce_binary<__serial_backend_tag, _ExecutionPolicy> {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H
diff --git a/libcxx/include/__pstl/backends/std_thread.h b/libcxx/include/__pstl/backends/std_thread.h
index 49570bd..19b985f 100644
--- a/libcxx/include/__pstl/backends/std_thread.h
+++ b/libcxx/include/__pstl/backends/std_thread.h
@@ -32,6 +32,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -131,6 +133,8 @@ struct __fill<__std_thread_backend_tag, _ExecutionPolicy>
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H
diff --git a/libcxx/include/__pstl/cpu_algos/any_of.h b/libcxx/include/__pstl/cpu_algos/any_of.h
index b33c787..803db79 100644
--- a/libcxx/include/__pstl/cpu_algos/any_of.h
+++ b/libcxx/include/__pstl/cpu_algos/any_of.h
@@ -26,6 +26,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -94,6 +96,8 @@ struct __cpu_parallel_any_of {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
index 0483d691..5e59752 100644
--- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -19,6 +19,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -81,6 +83,8 @@ struct __cpu_traits;
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
diff --git a/libcxx/include/__pstl/cpu_algos/fill.h b/libcxx/include/__pstl/cpu_algos/fill.h
index 4e6d29b..3e59365 100644
--- a/libcxx/include/__pstl/cpu_algos/fill.h
+++ b/libcxx/include/__pstl/cpu_algos/fill.h
@@ -23,6 +23,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -63,4 +65,6 @@ struct __cpu_parallel_fill {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_FILL_H
diff --git a/libcxx/include/__pstl/cpu_algos/find_if.h b/libcxx/include/__pstl/cpu_algos/find_if.h
index 12b2e88..cd92e5a 100644
--- a/libcxx/include/__pstl/cpu_algos/find_if.h
+++ b/libcxx/include/__pstl/cpu_algos/find_if.h
@@ -31,6 +31,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -132,6 +134,8 @@ struct __cpu_parallel_find_if {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H
diff --git a/libcxx/include/__pstl/cpu_algos/for_each.h b/libcxx/include/__pstl/cpu_algos/for_each.h
index d4d7862..cec719b 100644
--- a/libcxx/include/__pstl/cpu_algos/for_each.h
+++ b/libcxx/include/__pstl/cpu_algos/for_each.h
@@ -23,6 +23,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -63,4 +65,6 @@ struct __cpu_parallel_for_each {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H
diff --git a/libcxx/include/__pstl/cpu_algos/merge.h b/libcxx/include/__pstl/cpu_algos/merge.h
index dfa4cbf..a9069ca 100644
--- a/libcxx/include/__pstl/cpu_algos/merge.h
+++ b/libcxx/include/__pstl/cpu_algos/merge.h
@@ -26,6 +26,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -80,6 +82,8 @@ struct __cpu_parallel_merge {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_MERGE_H
diff --git a/libcxx/include/__pstl/cpu_algos/stable_sort.h b/libcxx/include/__pstl/cpu_algos/stable_sort.h
index 8e64f3e..5afdd3f 100644
--- a/libcxx/include/__pstl/cpu_algos/stable_sort.h
+++ b/libcxx/include/__pstl/cpu_algos/stable_sort.h
@@ -21,6 +21,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -44,4 +46,6 @@ struct __cpu_parallel_stable_sort {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H
diff --git a/libcxx/include/__pstl/cpu_algos/transform.h b/libcxx/include/__pstl/cpu_algos/transform.h
index 27ce8e2..979121b 100644
--- a/libcxx/include/__pstl/cpu_algos/transform.h
+++ b/libcxx/include/__pstl/cpu_algos/transform.h
@@ -27,6 +27,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -148,6 +150,8 @@ struct __cpu_parallel_transform_binary {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H
diff --git a/libcxx/include/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__pstl/cpu_algos/transform_reduce.h
index 36ac1a9..aafbf1c 100644
--- a/libcxx/include/__pstl/cpu_algos/transform_reduce.h
+++ b/libcxx/include/__pstl/cpu_algos/transform_reduce.h
@@ -31,6 +31,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -211,6 +213,8 @@ struct __cpu_parallel_transform_reduce {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 5e903f7..ea40fa7 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -23,6 +23,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -61,6 +63,8 @@ using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfigu
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_DISPATCH_H
diff --git a/libcxx/include/__pstl/handle_exception.h b/libcxx/include/__pstl/handle_exception.h
index d627095..57dfcfd 100644
--- a/libcxx/include/__pstl/handle_exception.h
+++ b/libcxx/include/__pstl/handle_exception.h
@@ -22,6 +22,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -52,6 +54,8 @@ _LIBCPP_HIDE_FROM_ABI auto __handle_exception(_Args&&... __args) {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_HANDLE_EXCEPTION_H
diff --git a/libcxx/include/experimental/__config b/libcxx/include/experimental/__config
deleted file mode 100644
index 7b23791..0000000
--- a/libcxx/include/experimental/__config
+++ /dev/null
@@ -1,45 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL_CONFIG
-#define _LIBCPP_EXPERIMENTAL_CONFIG
-
-#include <__config>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-# pragma GCC system_header
-#endif
-
-#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL \
- namespace std { \
- namespace experimental {
-#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL \
- } \
- }
-
-#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 {
-#define _LIBCPP_END_NAMESPACE_LFTS \
- } \
- } \
- }
-
-#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 {
-#define _LIBCPP_END_NAMESPACE_LFTS_V2 \
- } \
- } \
- }
-
-// TODO: support more targets
-#if defined(__AVX__)
-# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32
-#else
-# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16
-#endif
-
-#endif
diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h
index 31d2b50..e364e14 100644
--- a/libcxx/include/experimental/__simd/aligned_tag.h
+++ b/libcxx/include/experimental/__simd/aligned_tag.h
@@ -10,10 +10,10 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
#define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
+#include <__config>
#include <__memory/assume_aligned.h>
#include <__type_traits/remove_const.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/traits.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h
index 7b45d03..2ac7224 100644
--- a/libcxx/include/experimental/__simd/declaration.h
+++ b/libcxx/include/experimental/__simd/declaration.h
@@ -10,11 +10,18 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
#define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
+#include <__config>
#include <cstddef>
-#include <experimental/__config>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
+// TODO: support more targets
+# if defined(__AVX__)
+# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32
+# else
+# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16
+# endif
+
_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
inline namespace parallelism_v2 {
namespace simd_abi {
diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h
index c60c08b..cba460b 100644
--- a/libcxx/include/experimental/__simd/reference.h
+++ b/libcxx/include/experimental/__simd/reference.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H
#define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_assignable.h>
#include <__type_traits/is_same.h>
@@ -17,7 +18,6 @@
#include <__utility/forward.h>
#include <__utility/move.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/utility.h>
_LIBCPP_PUSH_MACROS
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index a2aeeb5..a76933e 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -11,9 +11,9 @@
#define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H
#include <__assert>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index db0f9b3..2c65d19 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -10,12 +10,12 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H
#define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_same.h>
#include <__type_traits/remove_cvref.h>
#include <__utility/forward.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index d54d489..5527319 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -10,10 +10,10 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H
#define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_same.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h
index ec25b4b..b817df60 100644
--- a/libcxx/include/experimental/__simd/traits.h
+++ b/libcxx/include/experimental/__simd/traits.h
@@ -11,10 +11,10 @@
#define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
#include <__bit/bit_ceil.h>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <__type_traits/is_same.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/__simd/utility.h b/libcxx/include/experimental/__simd/utility.h
index 708fa3d..0103b06 100644
--- a/libcxx/include/experimental/__simd/utility.h
+++ b/libcxx/include/experimental/__simd/utility.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H
#define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H
+#include <__config>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_const.h>
#include <__type_traits/is_constant_evaluated.h>
@@ -22,7 +23,6 @@
#include <__utility/integer_sequence.h>
#include <cstddef>
#include <cstdint>
-#include <experimental/__config>
#include <limits>
_LIBCPP_PUSH_MACROS
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index 5787f23..1f707cf 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -12,11 +12,11 @@
#include <__assert>
#include <__bit/bit_ceil.h>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <__utility/forward.h>
#include <__utility/integer_sequence.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
#include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index de82da2..edfe6e7 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -52,11 +52,11 @@ namespace std {
*/
+#include <__config>
#include <__memory/addressof.h>
#include <__type_traits/decay.h>
#include <__utility/forward.h>
#include <__utility/move.h>
-#include <experimental/__config>
#include <iterator>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory
index e9663d4..bf8a154 100644
--- a/libcxx/include/experimental/memory
+++ b/libcxx/include/experimental/memory
@@ -49,6 +49,7 @@ public:
}
*/
+#include <__config>
#include <__functional/hash.h>
#include <__functional/operations.h>
#include <__type_traits/add_lvalue_reference.h>
@@ -57,7 +58,6 @@ public:
#include <__type_traits/enable_if.h>
#include <__type_traits/is_convertible.h>
#include <cstddef>
-#include <experimental/__config>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const
index d7a695d..510d374 100644
--- a/libcxx/include/experimental/propagate_const
+++ b/libcxx/include/experimental/propagate_const
@@ -107,6 +107,7 @@
*/
+#include <__config>
#include <__functional/operations.h>
#include <__fwd/functional.h>
#include <__type_traits/conditional.h>
@@ -128,7 +129,6 @@
#include <__utility/move.h>
#include <__utility/swap.h>
#include <cstddef>
-#include <experimental/__config>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd
index 484543b..35120b4 100644
--- a/libcxx/include/experimental/simd
+++ b/libcxx/include/experimental/simd
@@ -75,7 +75,7 @@ inline namespace parallelism_v2 {
# pragma GCC system_header
#endif
-#include <experimental/__config>
+#include <__config>
#include <experimental/__simd/aligned_tag.h>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits
index 31b041b..a4bb59a 100644
--- a/libcxx/include/experimental/type_traits
+++ b/libcxx/include/experimental/type_traits
@@ -68,7 +68,7 @@ inline namespace fundamentals_v1 {
*/
-#include <experimental/__config>
+#include <__config>
#if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility
index 8bd0a05..cbc7ad1 100644
--- a/libcxx/include/experimental/utility
+++ b/libcxx/include/experimental/utility
@@ -30,7 +30,7 @@ inline namespace fundamentals_v1 {
*/
-#include <experimental/__config>
+#include <__config>
#include <utility>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 65df579..add8726d 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -610,10 +610,6 @@ module std_experimental [system] {
header "experimental/utility"
export *
}
- module __config {
- textual header "experimental/__config"
- export *
- }
}
// Convenience method to get all of the above modules in a single import statement.
diff --git a/libcxx/src/any.cpp b/libcxx/src/any.cpp
index b0ad695..eaca2dd 100644
--- a/libcxx/src/any.cpp
+++ b/libcxx/src/any.cpp
@@ -12,7 +12,7 @@ namespace std {
const char* bad_any_cast::what() const noexcept { return "bad any cast"; }
} // namespace std
-#include <experimental/__config>
+#include <__config>
// Preserve std::experimental::any_bad_cast for ABI compatibility
// Even though it no longer exists in a header file
diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp
index 62b474a..4e7e288 100644
--- a/libcxx/src/optional.cpp
+++ b/libcxx/src/optional.cpp
@@ -17,7 +17,7 @@ const char* bad_optional_access::what() const noexcept { return "bad_optional_ac
} // namespace std
-#include <experimental/__config>
+#include <__config>
// Preserve std::experimental::bad_optional_access for ABI compatibility
// Even though it no longer exists in a header file
diff --git a/lld/test/wasm/static-error.s b/lld/test/wasm/static-error.s
new file mode 100644
index 0000000..3557506
--- /dev/null
+++ b/lld/test/wasm/static-error.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o
+// RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o
+
+// RUN: wasm-ld --experimental-pic -pie -o /dev/null %t.o %t.so
+// RUN: not wasm-ld -o /dev/null -static %t.o %t.so 2>&1 | FileCheck %s
+
+// CHECK: attempted static link of dynamic object
+
+.global _start
+_start:
+ .functype _start () -> ()
+ end_function
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index cb8fe25..2de7dca 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -333,9 +333,15 @@ void LinkerDriver::addFile(StringRef path) {
return;
}
case file_magic::bitcode:
- case file_magic::wasm_object:
- files.push_back(createObjectFile(mbref, "", 0, inLib));
+ case file_magic::wasm_object: {
+ auto obj = createObjectFile(mbref, "", 0, inLib);
+ if (config->isStatic && isa<SharedFile>(obj)) {
+ error("attempted static link of dynamic object " + path);
+ break;
+ }
+ files.push_back(obj);
break;
+ }
case file_magic::unknown:
if (mbref.getBuffer().starts_with("#STUB")) {
files.push_back(make<StubFile>(mbref));
diff --git a/lldb/include/lldb/API/SBMemoryRegionInfo.h b/lldb/include/lldb/API/SBMemoryRegionInfo.h
index be55de4..f9a5dc9 100644
--- a/lldb/include/lldb/API/SBMemoryRegionInfo.h
+++ b/lldb/include/lldb/API/SBMemoryRegionInfo.h
@@ -120,7 +120,7 @@ public:
private:
friend class SBProcess;
friend class SBMemoryRegionInfoList;
-
+ friend class SBSaveCoreOptions;
friend class lldb_private::ScriptInterpreter;
lldb_private::MemoryRegionInfo &ref();
diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h
index ba48ba5..c076d3c 100644
--- a/lldb/include/lldb/API/SBSaveCoreOptions.h
+++ b/lldb/include/lldb/API/SBSaveCoreOptions.h
@@ -80,6 +80,17 @@ public:
/// \return True if the thread was removed, false if it was not in the list.
bool RemoveThread(lldb::SBThread thread);
+ /// Add a memory region to save in the core file.
+ ///
+ /// \param region The memory region to save.
+ /// \returns An empty SBError upon success, or an error if the region is
+ /// invalid.
+ /// \note Ranges that overlap will be unioned into a single region; this
+ /// also supersedes stack minification. Specifying full regions and a
+ /// non-custom core style will include the specified regions and union them
+ /// with all style-specific regions.
+ SBError AddMemoryRegionToSave(const SBMemoryRegionInfo &region);
+
/// Reset all options.
void Clear();
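A hypothetical call sequence for the new API (the region lookup through
SBProcess is illustrative context, not part of this patch):

    lldb::SBMemoryRegionInfo region;
    process.GetMemoryRegionInfo(interesting_addr, region); // populate region
    lldb::SBSaveCoreOptions options;
    lldb::SBError error = options.AddMemoryRegionToSave(region);
    // Combined with the new eSaveCoreCustomOnly style, only regions added
    // this way end up in the core file.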
diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h
index 172824d..e386271 100644
--- a/lldb/include/lldb/Core/SourceManager.h
+++ b/lldb/include/lldb/Core/SourceManager.h
@@ -74,6 +74,10 @@ public:
const Checksum &GetChecksum() const { return m_checksum; }
+ llvm::once_flag &GetChecksumWarningOnceFlag() {
+ return m_checksum_warning_once_flag;
+ }
+
protected:
/// Set file and update modification time.
void SetSupportFile(lldb::SupportFileSP support_file_sp);
@@ -87,6 +91,9 @@ public:
/// Keep track of the on-disk checksum.
Checksum m_checksum;
+ /// Once flag for emitting a checksum mismatch warning.
+ llvm::once_flag m_checksum_warning_once_flag;
+
// Keep the modification time that this file data is valid for
llvm::sys::TimePoint<> m_mod_time;
diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h
index f4fed46..d90d080 100644
--- a/lldb/include/lldb/Symbol/SaveCoreOptions.h
+++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h
@@ -10,13 +10,15 @@
#define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H
#include "lldb/Utility/FileSpec.h"
-#include "lldb/lldb-forward.h"
-#include "lldb/lldb-types.h"
+#include "lldb/Utility/RangeMap.h"
#include <optional>
+#include <set>
#include <string>
#include <unordered_set>
+using MemoryRanges = lldb_private::RangeVector<lldb::addr_t, lldb::addr_t>;
+
namespace lldb_private {
class SaveCoreOptions {
@@ -38,8 +40,12 @@ public:
Status AddThread(lldb::ThreadSP thread_sp);
bool RemoveThread(lldb::ThreadSP thread_sp);
bool ShouldThreadBeSaved(lldb::tid_t tid) const;
+ bool HasSpecifiedThreads() const;
Status EnsureValidConfiguration(lldb::ProcessSP process_sp) const;
+ const MemoryRanges &GetCoreFileMemoryRanges() const;
+
+ void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo &region);
void Clear();
@@ -51,6 +57,7 @@ private:
std::optional<lldb::SaveCoreStyle> m_style;
lldb::ProcessSP m_process_sp;
std::unordered_set<lldb::tid_t> m_threads_to_save;
+ MemoryRanges m_regions_to_save;
};
} // namespace lldb_private
diff --git a/lldb/include/lldb/Target/CoreFileMemoryRanges.h b/lldb/include/lldb/Target/CoreFileMemoryRanges.h
new file mode 100644
index 0000000..503ecd6
--- /dev/null
+++ b/lldb/include/lldb/Target/CoreFileMemoryRanges.h
@@ -0,0 +1,50 @@
+//===-- CoreFileMemoryRanges.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Utility/RangeMap.h"
+#include "lldb/Utility/Status.h"
+
+#include "llvm/ADT/AddressRanges.h"
+
+#ifndef LLDB_TARGET_COREFILEMEMORYRANGES_H
+#define LLDB_TARGET_COREFILEMEMORYRANGES_H
+
+namespace lldb_private {
+
+struct CoreFileMemoryRange {
+ llvm::AddressRange range; /// The address range to save into the core file.
+ uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits.
+
+ bool operator==(const CoreFileMemoryRange &rhs) const {
+ return range == rhs.range && lldb_permissions == rhs.lldb_permissions;
+ }
+
+ bool operator!=(const CoreFileMemoryRange &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const CoreFileMemoryRange &rhs) const {
+ if (range < rhs.range)
+ return true;
+ if (range == rhs.range)
+ return lldb_permissions < rhs.lldb_permissions;
+ return false;
+ }
+};
+
+class CoreFileMemoryRanges
+ : public lldb_private::RangeDataVector<lldb::addr_t, lldb::addr_t,
+ CoreFileMemoryRange> {
+public:
+ /// Finalize and merge all overlapping ranges in this collection. Ranges
+ /// will be separated based on permissions.
+ Status FinalizeCoreFileSaveRanges();
+};
+} // namespace lldb_private
+
+#endif // LLDB_TARGET_COREFILEMEMORYRANGES_H
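Putting the new pieces together, a plausible usage sketch (addresses and
permissions are made-up values; the three-argument Append overload is the one
added to RangeDataVector later in this patch):

    lldb_private::CoreFileMemoryRanges ranges;
    ranges.Append(0x1000, 0x1000, // base and size keys for the range data
                  lldb_private::CoreFileMemoryRange{
                      llvm::AddressRange(0x1000, 0x2000),
                      lldb::ePermissionsReadable});
    lldb_private::Status error = ranges.FinalizeCoreFileSaveRanges();
    if (error.Fail()) // the merge of overlapping ranges failed
      return error;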
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index c66cfb2..b8c53a4 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -35,6 +35,8 @@
#include "lldb/Host/ProcessLaunchInfo.h"
#include "lldb/Host/ProcessRunLock.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
+#include "lldb/Target/CoreFileMemoryRanges.h"
#include "lldb/Target/ExecutionContextScope.h"
#include "lldb/Target/InstrumentationRuntime.h"
#include "lldb/Target/Memory.h"
@@ -710,29 +712,6 @@ public:
/// is not supported by the plugin, error otherwise.
virtual llvm::Expected<bool> SaveCore(llvm::StringRef outfile);
- struct CoreFileMemoryRange {
- llvm::AddressRange range; /// The address range to save into the core file.
- uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits.
-
- bool operator==(const CoreFileMemoryRange &rhs) const {
- return range == rhs.range && lldb_permissions == rhs.lldb_permissions;
- }
-
- bool operator!=(const CoreFileMemoryRange &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator<(const CoreFileMemoryRange &rhs) const {
- if (range < rhs.range)
- return true;
- if (range == rhs.range)
- return lldb_permissions < rhs.lldb_permissions;
- return false;
- }
- };
-
- using CoreFileMemoryRanges = std::vector<CoreFileMemoryRange>;
-
/// Helper function for Process::SaveCore(...) that calculates the address
/// ranges that should be saved. This allows all core file plug-ins to save
/// consistent memory ranges given a \a core_style.
diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h
index 8cc382b..433466e 100644
--- a/lldb/include/lldb/Utility/RangeMap.h
+++ b/lldb/include/lldb/Utility/RangeMap.h
@@ -450,6 +450,12 @@ public:
void Append(const Entry &entry) { m_entries.emplace_back(entry); }
+ /// Append a range with data to the vector.
+ /// \param b The base of the memory range
+ /// \param s The size of the memory range
+ /// \param t The data associated with the memory range
+ void Append(B &&b, S &&s, T &&t) { m_entries.emplace_back(Entry(b, s, t)); }
+
bool Erase(uint32_t start, uint32_t end) {
if (start >= end || end > m_entries.size())
return false;
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 7bfde8b..938f6e3 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -1222,6 +1222,7 @@ enum SaveCoreStyle {
eSaveCoreFull = 1,
eSaveCoreDirtyOnly = 2,
eSaveCoreStackOnly = 3,
+ eSaveCoreCustomOnly = 4,
};
/// Events that might happen during a trace session.
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index 337eff6..5fb288a 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -207,6 +207,7 @@ class StackFrameRecognizer;
class StackFrameRecognizerManager;
class StackID;
class Status;
+class SaveCoreOptions;
class StopInfo;
class Stoppoint;
class StoppointCallbackContext;
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index b3c8cda..5bac5cd 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -9,7 +9,6 @@
#ifndef LLDB_LLDB_PRIVATE_INTERFACES_H
#define LLDB_LLDB_PRIVATE_INTERFACES_H
-#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/lldb-enumerations.h"
#include "lldb/lldb-forward.h"
#include "lldb/lldb-private-enumerations.h"
diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp
index ef82b02..c79b57f 100644
--- a/lldb/source/API/SBSaveCoreOptions.cpp
+++ b/lldb/source/API/SBSaveCoreOptions.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "lldb/API/SBSaveCoreOptions.h"
+#include "lldb/API/SBMemoryRegionInfo.h"
#include "lldb/Host/FileSystem.h"
#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/Instrumentation.h"
@@ -89,6 +90,16 @@ bool SBSaveCoreOptions::RemoveThread(lldb::SBThread thread) {
return m_opaque_up->RemoveThread(thread.GetSP());
}
+lldb::SBError
+SBSaveCoreOptions::AddMemoryRegionToSave(const SBMemoryRegionInfo &region) {
+ LLDB_INSTRUMENT_VA(this, region);
+ // Currently, adding a memory region can't fail, so we always return a
+ // success SBError; but because these APIs live forever, returning an error
+ // is the most future-proof thing to do.
+ m_opaque_up->AddMemoryRegionToSave(region.ref());
+ return SBError();
+}
+
void SBSaveCoreOptions::Clear() {
LLDB_INSTRUMENT_VA(this);
m_opaque_up->Clear();
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 25eb633..5b0f4f6 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -25,6 +25,7 @@
#include "lldb/Interpreter/OptionArgParser.h"
#include "lldb/Interpreter/OptionGroupPythonClassWithDict.h"
#include "lldb/Interpreter/Options.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Platform.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/StopInfo.h"
diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp
index f97d86a..fd5b499 100644
--- a/lldb/source/Core/SourceManager.cpp
+++ b/lldb/source/Core/SourceManager.cpp
@@ -61,6 +61,12 @@ static void resolve_tilde(FileSpec &file_spec) {
}
}
+static std::string toString(const Checksum &checksum) {
+ if (!checksum)
+ return "";
+ return std::string(llvm::formatv("{0}", checksum.digest()));
+}
+
// SourceManager constructor
SourceManager::SourceManager(const TargetSP &target_sp)
: m_last_support_file_sp(std::make_shared<SupportFile>()), m_last_line(0),
@@ -302,6 +308,18 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile(
break;
}
}
+
+ Checksum line_table_checksum =
+ last_file_sp->GetSupportFile()->GetChecksum();
+ Checksum on_disk_checksum = last_file_sp->GetChecksum();
+ if (line_table_checksum && line_table_checksum != on_disk_checksum)
+ Debugger::ReportWarning(
+ llvm::formatv(
+ "{0}: source file checksum mismatch between line table "
+ "({1}) and file on disk ({2})",
+ last_file_sp->GetSupportFile()->GetSpecOnly().GetFilename(),
+ toString(line_table_checksum), toString(on_disk_checksum)),
+ std::nullopt, &last_file_sp->GetChecksumWarningOnceFlag());
}
return *delta;
}
@@ -837,12 +855,6 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile(
return {};
}
-static std::string toString(const Checksum &checksum) {
- if (!checksum)
- return "";
- return std::string(llvm::formatv("{0}", checksum.digest()));
-}
-
void SourceManager::SourceFileCache::Dump(Stream &stream) const {
// clang-format off
stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n";
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index b28beab..06da83e 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -6562,13 +6562,15 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp,
}
if (make_core) {
- Process::CoreFileMemoryRanges core_ranges;
+ CoreFileMemoryRanges core_ranges;
error = process_sp->CalculateCoreFileSaveRanges(options, core_ranges);
if (error.Success()) {
const uint32_t addr_byte_size = target_arch.GetAddressByteSize();
const ByteOrder byte_order = target_arch.GetByteOrder();
std::vector<llvm::MachO::segment_command_64> segment_load_commands;
- for (const auto &core_range : core_ranges) {
+ for (const auto &core_range_info : core_ranges) {
+ // TODO: Refactor RangeDataVector to have a data iterator.
+ const auto &core_range = core_range_info.data;
uint32_t cmd_type = LC_SEGMENT_64;
uint32_t segment_size = sizeof(llvm::MachO::segment_command_64);
if (addr_byte_size == 4) {
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
index 27bc237..be87112 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
@@ -12,6 +12,7 @@
#include "lldb/Core/Address.h"
#include "lldb/Host/SafeMachO.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/FileSpec.h"
#include "lldb/Utility/FileSpecList.h"
#include "lldb/Utility/RangeMap.h"
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index 5c9ba22..edc568a 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -831,25 +831,32 @@ Status MinidumpFileBuilder::AddMemoryList() {
// bytes of the core file. Thread structures in minidump files can only use
// 32 bit memory descriptors, so we emit them first to ensure the memory is
// accessible with a 32 bit offset.
- Process::CoreFileMemoryRanges ranges_32;
- Process::CoreFileMemoryRanges ranges_64;
- Process::CoreFileMemoryRanges all_core_memory_ranges;
+ std::vector<CoreFileMemoryRange> ranges_32;
+ std::vector<CoreFileMemoryRange> ranges_64;
+ CoreFileMemoryRanges all_core_memory_ranges;
error = m_process_sp->CalculateCoreFileSaveRanges(m_save_core_options,
all_core_memory_ranges);
+
+ std::vector<CoreFileMemoryRange> all_core_memory_vec;
+ // Extract all the data into a plain vector so that we can mutate it in
+ // place.
+ for (const auto &core_range : all_core_memory_ranges)
+ all_core_memory_vec.push_back(core_range.data);
+
if (error.Fail())
return error;
// Start by saving all of the stacks and ensuring they fit under the 32b
// limit.
uint64_t total_size = GetCurrentDataEndOffset();
- auto iterator = all_core_memory_ranges.begin();
- while (iterator != all_core_memory_ranges.end()) {
+ auto iterator = all_core_memory_vec.begin();
+ while (iterator != all_core_memory_vec.end()) {
if (m_saved_stack_ranges.count(iterator->range.start()) > 0) {
// We don't save stacks twice.
ranges_32.push_back(*iterator);
total_size +=
iterator->range.size() + sizeof(llvm::minidump::MemoryDescriptor);
- iterator = all_core_memory_ranges.erase(iterator);
+ iterator = all_core_memory_vec.erase(iterator);
} else {
iterator++;
}
@@ -869,11 +876,11 @@ Status MinidumpFileBuilder::AddMemoryList() {
// Then anything that overflows extends into 64b addressable space.
// All core memory ranges will contain either nothing, the stacks only,
// or all the memory ranges including the stacks.
- if (!all_core_memory_ranges.empty())
- total_size += 256 + (all_core_memory_ranges.size() *
+ if (!all_core_memory_vec.empty())
+ total_size += 256 + (all_core_memory_vec.size() *
sizeof(llvm::minidump::MemoryDescriptor_64));
- for (const auto &core_range : all_core_memory_ranges) {
+ for (const auto &core_range : all_core_memory_vec) {
const addr_t range_size = core_range.range.size();
// We don't need to check for stacks here because we already removed them
// from all_core_memory_vec.
@@ -958,15 +965,15 @@ Status MinidumpFileBuilder::DumpDirectories() const {
}
static uint64_t
-GetLargestRangeSize(const Process::CoreFileMemoryRanges &ranges) {
+GetLargestRangeSize(const std::vector<CoreFileMemoryRange> &ranges) {
uint64_t max_size = 0;
for (const auto &core_range : ranges)
max_size = std::max(max_size, core_range.range.size());
return max_size;
}
-Status
-MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) {
+Status MinidumpFileBuilder::AddMemoryList_32(
+ std::vector<CoreFileMemoryRange> &ranges) {
std::vector<MemoryDescriptor> descriptors;
Status error;
if (ranges.size() == 0)
@@ -1042,8 +1049,8 @@ MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) {
return error;
}
-Status
-MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) {
+Status MinidumpFileBuilder::AddMemoryList_64(
+ std::vector<CoreFileMemoryRange> &ranges) {
Status error;
if (ranges.empty())
return error;
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 762de83..71001e2 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -23,6 +23,7 @@
#include <utility>
#include <variant>
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/Target.h"
#include "lldb/Utility/DataBufferHeap.h"
@@ -120,9 +121,9 @@ private:
lldb_private::Status AddData(const void *data, uint64_t size);
// Add MemoryList stream, containing dumps of important memory segments
lldb_private::Status
- AddMemoryList_64(lldb_private::Process::CoreFileMemoryRanges &ranges);
+ AddMemoryList_64(std::vector<lldb_private::CoreFileMemoryRange> &ranges);
lldb_private::Status
- AddMemoryList_32(lldb_private::Process::CoreFileMemoryRanges &ranges);
+ AddMemoryList_32(std::vector<lldb_private::CoreFileMemoryRange> &ranges);
// Update the thread list on disk with the newly emitted stack RVAs.
lldb_private::Status FixThreadStacks();
lldb_private::Status FlushBufferToDisk();
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
index b76fcd0..2f45f01 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
@@ -21,6 +21,7 @@
#define LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_OBJECTFILEMINIDUMP_H
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/ArchSpec.h"
class ObjectFileMinidump : public lldb_private::PluginInterface {
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index 9d01089..8d9c919 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -17,6 +17,7 @@
#include "lldb/Interpreter/OptionValueDictionary.h"
#include "lldb/Interpreter/OptionValueProperties.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/SectionLoadList.h"
#include "lldb/Target/Target.h"
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
index 8bccf3b..4f4dedf 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
@@ -13,6 +13,7 @@
#include <vector>
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "llvm/Object/COFF.h"
class ObjectFilePECOFF : public lldb_private::ObjectFile {
diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp
index 35943726..8d9aade 100644
--- a/lldb/source/Symbol/SaveCoreOptions.cpp
+++ b/lldb/source/Symbol/SaveCoreOptions.cpp
@@ -102,6 +102,19 @@ bool SaveCoreOptions::ShouldThreadBeSaved(lldb::tid_t tid) const {
return m_threads_to_save.count(tid) > 0;
}
+bool SaveCoreOptions::HasSpecifiedThreads() const {
+ return !m_threads_to_save.empty();
+}
+
+void SaveCoreOptions::AddMemoryRegionToSave(
+ const lldb_private::MemoryRegionInfo &region) {
+ m_regions_to_save.Insert(region.GetRange(), /*combine=*/true);
+}
+
+const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const {
+ return m_regions_to_save;
+}
+
Status SaveCoreOptions::EnsureValidConfiguration(
lldb::ProcessSP process_sp) const {
Status error;
@@ -131,4 +144,5 @@ void SaveCoreOptions::Clear() {
m_style = std::nullopt;
m_threads_to_save.clear();
m_process_sp.reset();
+ m_regions_to_save.Clear();
}
diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt
index a42c44b..a6d2eac 100644
--- a/lldb/source/Target/CMakeLists.txt
+++ b/lldb/source/Target/CMakeLists.txt
@@ -11,6 +11,7 @@ add_lldb_library(lldbTarget
ABI.cpp
AssertFrameRecognizer.cpp
DynamicRegisterInfo.cpp
+ CoreFileMemoryRanges.cpp
ExecutionContext.cpp
InstrumentationRuntime.cpp
InstrumentationRuntimeStopInfo.cpp
diff --git a/lldb/source/Target/CoreFileMemoryRanges.cpp b/lldb/source/Target/CoreFileMemoryRanges.cpp
new file mode 100644
index 0000000..6e4ca49
--- /dev/null
+++ b/lldb/source/Target/CoreFileMemoryRanges.cpp
@@ -0,0 +1,86 @@
+//===-- CoreFileMemoryRanges.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Target/CoreFileMemoryRanges.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+using Entry = CoreFileMemoryRanges::Entry;
+
+static bool Overlaps(const Entry *region_one, const Entry *region_two) {
+ return !(region_one->GetRangeEnd() < region_two->GetRangeBase() ||
+ region_two->GetRangeEnd() < region_one->GetRangeBase());
+}
+
+static bool IntersectHelper(const Entry *region_one, const Entry *region_two) {
+ return region_one->GetRangeBase() == region_two->GetRangeEnd() ||
+ region_one->GetRangeEnd() == region_two->GetRangeBase();
+}
+
+static bool OnlyIntersects(const Entry *region_one, const Entry *region_two) {
+ return IntersectHelper(region_one, region_two) ||
+ IntersectHelper(region_two, region_one);
+}
+
+static bool PermissionsMatch(const Entry *region_one, const Entry *region_two) {
+ return region_one->data.lldb_permissions == region_two->data.lldb_permissions;
+}
+
+// This assumes any overlapping ranges will share the same permissions
+// and that adjacent ranges could have different permissions.
+Status CoreFileMemoryRanges::FinalizeCoreFileSaveRanges() {
+ Status error;
+ this->Sort();
+ for (size_t i = this->GetSize() - 1; i > 0; i--) {
+ auto region_one = this->GetMutableEntryAtIndex(i);
+ auto region_two = this->GetMutableEntryAtIndex(i - 1);
+ if (Overlaps(region_one, region_two)) {
+ // It's okay for intersecting regions to have different permissions, but
+ // if they overlap we fail because we don't know what to do with them.
+ if (!PermissionsMatch(region_one, region_two)) {
+ // Permissions mismatch and it's not a simple intersection.
+ if (!OnlyIntersects(region_one, region_two)) {
+ error = Status::FromErrorStringWithFormatv(
+ "Memory region at {0}::{1} has different permissions than "
+ "overlapping region at {2}::{3}",
+ region_one->GetRangeBase(), region_one->GetRangeEnd(),
+ region_two->GetRangeBase(), region_two->GetRangeEnd());
+ return error;
+ }
+ // Simple intersection; we can just skip merging these.
+ continue;
+ }
+ const addr_t base =
+ std::min(region_one->GetRangeBase(), region_two->GetRangeBase());
+ const addr_t byte_size =
+ std::max(region_one->GetRangeEnd(), region_two->GetRangeEnd()) - base;
+
+ region_two->SetRangeBase(base);
+ region_two->SetByteSize(byte_size);
+
+ // Because this is a range data vector, the entry has a base as well
+ // as the data contained in the entry. So we have to update both.
+ // And llvm::AddressRange isn't mutable so we have to create a new one.
+ llvm::AddressRange range(base, base + byte_size);
+ const CoreFileMemoryRange core_range = {
+ range, region_two->data.lldb_permissions};
+ region_two->data = core_range;
+ // Erase deletes the half-open index range [inclusive, exclusive).
+ if (!this->Erase(i, i + 1)) {
+ error = Status::FromErrorStringWithFormat(
+ "Core file memory ranges mutated outside of "
+ "CalculateCoreFileSaveRanges");
+ return error;
+ }
+ }
+ }
+
+ return error;
+}
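
For readers skimming the diff, here is a minimal standalone sketch of the merge policy FinalizeCoreFileSaveRanges implements, written with plain std types rather than lldb's RangeDataVector (all names are illustrative, not the lldb_private API): sort by base address, merge overlapping ranges only when permissions match, fail on a true overlap with mismatched permissions, and leave merely adjacent ranges with different permissions alone.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct Range {
  uint64_t base, end; // half-open [base, end)
  uint32_t permissions;
};

// Returns an error message on a true overlap with mismatched permissions.
std::optional<std::string> Finalize(std::vector<Range> &ranges) {
  std::sort(ranges.begin(), ranges.end(),
            [](const Range &a, const Range &b) { return a.base < b.base; });
  for (size_t i = ranges.size(); i-- > 1;) {
    Range &hi = ranges[i], &lo = ranges[i - 1];
    if (lo.end < hi.base) // disjoint: nothing to do
      continue;
    if (lo.permissions != hi.permissions) {
      if (lo.end == hi.base) // merely adjacent: keep both
        continue;
      return "overlapping regions have different permissions";
    }
    // Same permissions: fold the higher range into the lower one.
    lo.end = std::max(lo.end, hi.end);
    ranges.erase(ranges.begin() + i);
  }
  return std::nullopt;
}

As in the real code, the walk runs from high addresses down, so a freshly merged range is re-checked against its lower neighbor on the next iteration.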
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 40f3115..aca0897 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -6463,7 +6463,7 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len,
}
// Create a CoreFileMemoryRange from a MemoryRegionInfo
-static Process::CoreFileMemoryRange
+static CoreFileMemoryRange
CreateCoreFileMemoryRange(const MemoryRegionInfo &region) {
const addr_t addr = region.GetRange().GetRangeBase();
llvm::AddressRange range(addr, addr + region.GetRange().GetByteSize());
@@ -6474,7 +6474,7 @@ CreateCoreFileMemoryRange(const MemoryRegionInfo &region) {
// were added. Return false if the dirty page information is not valid for
// the region.
static bool AddDirtyPages(const MemoryRegionInfo &region,
- Process::CoreFileMemoryRanges &ranges) {
+ CoreFileMemoryRanges &ranges) {
const auto &dirty_page_list = region.GetDirtyPageList();
if (!dirty_page_list)
return false;
@@ -6494,14 +6494,14 @@ static bool AddDirtyPages(const MemoryRegionInfo &region,
} else {
// Add previous contiguous range and init the new range with the
// current dirty page.
- ranges.push_back({range, lldb_permissions});
+ ranges.Append(range.start(), range.size(), {range, lldb_permissions});
range = llvm::AddressRange(page_addr, page_addr + page_size);
}
}
}
// The last range
if (!range.empty())
- ranges.push_back({range, lldb_permissions});
+ ranges.Append(range.start(), range.size(), {range, lldb_permissions});
return true;
}
@@ -6513,7 +6513,7 @@ static bool AddDirtyPages(const MemoryRegionInfo &region,
// will be added to \a ranges, else the entire range will be added to \a
// ranges.
static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages,
- Process::CoreFileMemoryRanges &ranges) {
+ CoreFileMemoryRanges &ranges) {
// Don't add empty ranges.
if (region.GetRange().GetByteSize() == 0)
return;
@@ -6522,13 +6522,17 @@ static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages,
return;
if (try_dirty_pages && AddDirtyPages(region, ranges))
return;
- ranges.push_back(CreateCoreFileMemoryRange(region));
+
+ ranges.Append(region.GetRange().GetRangeBase(),
+ region.GetRange().GetByteSize(),
+ CreateCoreFileMemoryRange(region));
}
-static void SaveOffRegionsWithStackPointers(
- Process &process, const SaveCoreOptions &core_options,
- const MemoryRegionInfos &regions, Process::CoreFileMemoryRanges &ranges,
- std::set<addr_t> &stack_ends) {
+static void SaveOffRegionsWithStackPointers(Process &process,
+ const SaveCoreOptions &core_options,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
const bool try_dirty_pages = true;
// Before we take any dump, we want to save off the used portions of the
@@ -6568,11 +6572,11 @@ static void SaveOffRegionsWithStackPointers(
// for a full core file style.
static void GetCoreFileSaveRangesFull(Process &process,
const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges,
+ CoreFileMemoryRanges &ranges,
std::set<addr_t> &stack_ends) {
// Don't add only dirty pages, add full regions.
-const bool try_dirty_pages = false;
+ const bool try_dirty_pages = false;
for (const auto &region : regions)
if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0)
AddRegion(region, try_dirty_pages, ranges);
@@ -6582,9 +6586,10 @@ const bool try_dirty_pages = false;
// least some dirty pages, as some OS versions don't support reporting what
// pages are dirty within an memory region. If no memory regions have dirty
// page information fall back to saving out all ranges with write permissions.
-static void GetCoreFileSaveRangesDirtyOnly(
- Process &process, const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) {
+static void GetCoreFileSaveRangesDirtyOnly(Process &process,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
// Iterate over the regions and find all dirty pages.
bool have_dirty_page_info = false;
@@ -6613,9 +6618,10 @@ static void GetCoreFileSaveRangesDirtyOnly(
// dirty regions as this will make the core file smaller. If the process
// doesn't support dirty regions, then it will fall back to adding the full
// stack region.
-static void GetCoreFileSaveRangesStackOnly(
- Process &process, const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) {
+static void GetCoreFileSaveRangesStackOnly(Process &process,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
const bool try_dirty_pages = true;
// Some platforms support annotating the region information that tell us that
// it comes from a thread stack. So look for those regions first.
@@ -6628,6 +6634,24 @@ static void GetCoreFileSaveRangesStackOnly(
}
}
+static void GetUserSpecifiedCoreFileSaveRanges(Process &process,
+ const MemoryRegionInfos &regions,
+ const SaveCoreOptions &options,
+ CoreFileMemoryRanges &ranges) {
+ const auto &option_ranges = options.GetCoreFileMemoryRanges();
+ if (option_ranges.IsEmpty())
+ return;
+
+ for (const auto &range : regions) {
+ auto entry = option_ranges.FindEntryThatContains(range.GetRange());
+ if (entry) {
+ ranges.Append(range.GetRange().GetRangeBase(),
+ range.GetRange().GetByteSize(),
+ CreateCoreFileMemoryRange(range));
+ }
+ }
+}
+
Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
CoreFileMemoryRanges &ranges) {
lldb_private::MemoryRegionInfos regions;
@@ -6643,11 +6667,18 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
"callers must set the core_style to something other than "
"eSaveCoreUnspecified");
+ GetUserSpecifiedCoreFileSaveRanges(*this, regions, options, ranges);
+
std::set<addr_t> stack_ends;
- SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends);
+ // For fully custom setups, we don't want to even look at threads if there
+ // are no threads specified.
+ if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads())
+ SaveOffRegionsWithStackPointers(*this, options, regions, ranges,
+ stack_ends);
switch (core_style) {
case eSaveCoreUnspecified:
+ case eSaveCoreCustomOnly:
break;
case eSaveCoreFull:
@@ -6666,10 +6697,11 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
if (err.Fail())
return err;
- if (ranges.empty())
- return Status("no valid address ranges found for core style");
+ if (ranges.IsEmpty())
+ return Status::FromErrorStringWithFormat(
+ "no valid address ranges found for core style");
- return Status(); // Success!
+ return ranges.FinalizeCoreFileSaveRanges();
}
std::vector<ThreadSP>
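
The mechanical part of this file's change is the container swap: CoreFileMemoryRanges is a RangeDataVector, so every Append now passes the address range twice, once as the sortable (base, size) key and once inside the CoreFileMemoryRange payload that the Mach-O and minidump writers read back through entry.data. A hedged sketch of that shape with stand-in types (not the actual lldb/llvm classes):

#include <cstdint>
#include <vector>

// Stand-ins for llvm::AddressRange and lldb's CoreFileMemoryRange.
struct AddressRange {
  uint64_t start, end;
};
struct CoreFileMemoryRange {
  AddressRange range;
  uint32_t lldb_permissions;
};

// Minimal RangeDataVector-like entry: a (base, size) key for sorting and
// lookup, plus the payload consumers read via entry.data.
struct Entry {
  uint64_t base, size;
  CoreFileMemoryRange data;
};

struct Ranges {
  std::vector<Entry> entries;
  void Append(uint64_t base, uint64_t size, CoreFileMemoryRange data) {
    entries.push_back({base, size, data});
  }
};

int main() {
  Ranges ranges;
  AddressRange r{0x1000, 0x2000};
  // The key and the payload carry the same range, mirroring
  // ranges.Append(range.start(), range.size(), {range, perms}) above.
  ranges.Append(r.start, r.end - r.start, {r, /*lldb_permissions=*/0x3});
  return ranges.entries.size() == 1 ? 0 : 1;
}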
diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
index ed15793..2cbe20e 100644
--- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
+++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
@@ -344,3 +344,152 @@ class ProcessSaveCoreMinidumpTestCase(TestBase):
self.assertTrue(self.dbg.DeleteTarget(target))
if os.path.isfile(default_value_file):
os.unlink(default_value_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_linux_minidump_one_region(self):
+ """Test that we can save a Linux mini dump with one region in sbsavecore regions"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ one_region_file = self.getBuildArtifact("core.one_region.dmp")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ memory_region = lldb.SBMemoryRegionInfo()
+ memory_list = process.GetMemoryRegions()
+ memory_list.GetMemoryRegionAtIndex(0, memory_region)
+
+ # Save only the single region with the custom-only style so nothing
+ # else (like full thread stacks) is included in the core file.
+ options = lldb.SBSaveCoreOptions()
+ file_spec = lldb.SBFileSpec(one_region_file)
+ options.SetOutputFile(file_spec)
+ options.SetPluginName("minidump")
+ options.AddMemoryRegionToSave(memory_region)
+ options.SetStyle(lldb.eSaveCoreCustomOnly)
+ error = process.SaveCore(options)
+ print(f"Error: {error.GetCString()}")
+ self.assertTrue(error.Success(), error.GetCString())
+
+ core_target = self.dbg.CreateTarget(None)
+ core_proc = core_target.LoadCore(one_region_file)
+ core_memory_list = core_proc.GetMemoryRegions()
+ # Note because the /proc/pid maps are included on linux, we can't
+ # depend on size for validation, so we'll ensure the first region
+ # is present and then assert we fail on the second.
+ core_memory_region = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(0, core_memory_region)
+ self.assertEqual(
+ core_memory_region.GetRegionBase(), memory_region.GetRegionBase()
+ )
+ self.assertEqual(
+ core_memory_region.GetRegionEnd(), memory_region.GetRegionEnd()
+ )
+
+ region_two = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(1, region_two)
+ err = lldb.SBError()
+ content = core_proc.ReadMemory(region_two.GetRegionBase(), 1, err)
+ self.assertTrue(err.Fail(), "Should fail to read memory")
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
+ if os.path.isfile(one_region_file):
+ os.unlink(one_region_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_minidump_custom_save_style(self):
+ """Test that verifies a custom and unspecified save style fails for
+ containing no data to save"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ custom_file = self.getBuildArtifact("core.custom.dmp")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ options = lldb.SBSaveCoreOptions()
+ options.SetOutputFile(lldb.SBFileSpec(custom_file))
+ options.SetPluginName("minidump")
+ options.SetStyle(lldb.eSaveCoreCustomOnly)
+
+ error = process.SaveCore(options)
+ self.assertTrue(error.Fail())
+ self.assertEqual(
+ error.GetCString(), "no valid address ranges found for core style"
+ )
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
+ if os.path.isfile(custom_file):
+ os.unlink(custom_file)
+
+ def save_core_with_region(self, process, region_index):
+ custom_file = self.getBuildArtifact("core.custom.dmp")
+ try:
+ memory_region = lldb.SBMemoryRegionInfo()
+ memory_list = process.GetMemoryRegions()
+ memory_list.GetMemoryRegionAtIndex(region_index, memory_region)
+ options = lldb.SBSaveCoreOptions()
+ options.SetOutputFile(lldb.SBFileSpec(custom_file))
+ options.SetPluginName("minidump")
+ options.AddMemoryRegionToSave(memory_region)
+ options.SetStyle(lldb.eSaveCoreFull)
+
+ error = process.SaveCore(options)
+ self.assertTrue(error.Success())
+ core_target = self.dbg.CreateTarget(None)
+ core_proc = core_target.LoadCore(custom_file)
+ core_memory_list = core_proc.GetMemoryRegions()
+ # /proc/pid/ maps are included on Linux, so we can't depend on size
+ # for validation. Instead we collect all the ranges into a set
+ # and ensure there are no duplicates.
+ range_set = set()
+ for x in range(core_memory_list.GetSize()):
+ core_memory_region = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(x, core_memory_region)
+ mem_tuple = (
+ core_memory_region.GetRegionBase(),
+ core_memory_region.GetRegionEnd(),
+ )
+ self.assertTrue(
+ mem_tuple not in range_set, "Duplicate memory region found"
+ )
+ range_set.add(mem_tuple)
+ finally:
+ if os.path.isfile(custom_file):
+ os.unlink(custom_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_minidump_custom_save_style_duplicated_regions(self):
+ """Test that verifies a custom and unspecified save style fails for
+ containing no data to save"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ memory_list = process.GetMemoryRegions()
+ # Test that we don't duplicate regions by re-adding regions
+ # at various indices.
+ self.save_core_with_region(process, 0)
+ self.save_core_with_region(process, len(memory_list) - 1)
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
diff --git a/lldb/test/Shell/SymbolFile/Inputs/main.c b/lldb/test/Shell/SymbolFile/Inputs/main.c
new file mode 100644
index 0000000..341417f
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/Inputs/main.c
@@ -0,0 +1,4 @@
+int main(int argc, char **argv) {
+ // Break on main.
+ return 1;
+}
diff --git a/lldb/test/Shell/SymbolFile/checksum-mismatch.test b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
new file mode 100644
index 0000000..5db9764
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
@@ -0,0 +1,7 @@
+RUN: mkdir -p %t
+RUN: cp %S/Inputs/main.c %t/main.c
+RUN: %clang_host %t/main.c -std=c99 -gdwarf-5 -o %t/main.out
+RUN: echo "// Modify source file hash" >> %t/main.c
+RUN: %lldb -b %t/main.out -o 'b main' -o 'r' 2>&1 | FileCheck %s
+
+CHECK: warning: main.c: source file checksum mismatch between line table ({{.*}}) and file on disk ({{.*}})
diff --git a/lldb/unittests/Process/Utility/CMakeLists.txt b/lldb/unittests/Process/Utility/CMakeLists.txt
index 651f871..ec0ff95 100644
--- a/lldb/unittests/Process/Utility/CMakeLists.txt
+++ b/lldb/unittests/Process/Utility/CMakeLists.txt
@@ -18,6 +18,7 @@ add_lldb_unittest(ProcessUtilityTests
LinuxProcMapsTest.cpp
MemoryTagManagerAArch64MTETest.cpp
RegisterContextTest.cpp
+ CoreFileMemoryRangesTest.cpp
${PLATFORM_SOURCES}
LINK_LIBS
diff --git a/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
new file mode 100644
index 0000000..6d514b1
--- /dev/null
+++ b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
@@ -0,0 +1,205 @@
+//===-- CoreFileMemoryRangesTest.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "lldb/Target/CoreFileMemoryRanges.h"
+#include "lldb/lldb-types.h"
+
+using namespace lldb_private;
+
+TEST(CoreFileMemoryRangesTest, MapOverlappingRanges) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ // Arbitrary value
+ const uint32_t permissions = 0x3;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(range);
+ ASSERT_THAT(start_addr, range->GetRangeBase());
+ ASSERT_THAT(start_addr + (iterations * increment_addr), range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, RangesSplitByPermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ const uint32_t permissions = i;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(10, ranges.GetSize());
+ const auto range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(range);
+ ASSERT_THAT(start_addr, range->GetRangeBase());
+ ASSERT_THAT(start_addr + increment_addr, range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, MapPartialOverlappingRanges) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ // Arbitrary value
+ const uint32_t permissions = 0x3;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ const lldb::addr_t unique_start = 0x7fff0000;
+ const lldb::addr_t unique_end = unique_start + increment_addr;
+ llvm::AddressRange range(unique_start, unique_end);
+ const uint32_t permissions = 0x3;
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {unique_start, unique_end - unique_start,
+ core_range};
+ ranges.Append(entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(2, ranges.GetSize());
+ const auto merged_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(merged_range);
+ ASSERT_THAT(start_addr, merged_range->GetRangeBase());
+ ASSERT_THAT(start_addr + (iterations * increment_addr),
+ merged_range->GetRangeEnd());
+ const auto unique_range = ranges.GetEntryAtIndex(1);
+ ASSERT_TRUE(unique_range);
+ ASSERT_THAT(unique_start, unique_range->GetRangeBase());
+ ASSERT_THAT(unique_end, unique_range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_SamePermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const lldb::addr_t superior_region_end = start_addr + increment_addr * 10;
+ llvm::AddressRange range(start_addr, superior_region_end);
+ const CoreFileMemoryRange core_range = {range, 0x3};
+ CoreFileMemoryRanges::Entry entry = {
+ start_addr, superior_region_end - start_addr, core_range};
+ ranges.Append(entry);
+ const lldb::addr_t inferior_region_end = start_addr + increment_addr;
+ llvm::AddressRange inferior_range(start_addr, inferior_region_end);
+ const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x3};
+ CoreFileMemoryRanges::Entry inferior_entry = {
+ start_addr, inferior_region_end - start_addr, inferior_core_range};
+ ranges.Append(inferior_entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto searched_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(searched_range);
+ ASSERT_THAT(start_addr, searched_range->GetRangeBase());
+ ASSERT_THAT(superior_region_end, searched_range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_DifferentPermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const lldb::addr_t superior_region_end = start_addr + increment_addr * 10;
+ llvm::AddressRange range(start_addr, superior_region_end);
+ const CoreFileMemoryRange core_range = {range, 0x3};
+ CoreFileMemoryRanges::Entry entry = {
+ start_addr, superior_region_end - start_addr, core_range};
+ ranges.Append(entry);
+ const lldb::addr_t inferior_region_end = start_addr + increment_addr;
+ llvm::AddressRange inferior_range(start_addr, inferior_region_end);
+ const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x4};
+ CoreFileMemoryRanges::Entry inferior_entry = {
+ start_addr, inferior_region_end - start_addr, inferior_core_range};
+ ranges.Append(inferior_entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Fail());
+}
+
+TEST(CoreFileMemoryRangesTest, NonIntersectingRangesSamePermissions) {
+ const int permissions = 0x7;
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t region_one_start = 0x1000;
+ const lldb::addr_t region_one_end = 0x2000;
+ llvm::AddressRange range_one(region_one_start, region_one_end);
+ const CoreFileMemoryRange core_range_one = {range_one, permissions};
+ CoreFileMemoryRanges::Entry entry_one = {
+ region_one_start, region_one_end - region_one_start, core_range_one};
+ ranges.Append(entry_one);
+ const lldb::addr_t region_two_start = 0xb000;
+ const lldb::addr_t region_two_end = 0xc000;
+ llvm::AddressRange range_two(region_two_start, region_two_end);
+ const CoreFileMemoryRange core_range_two = {range_two, permissions};
+ CoreFileMemoryRanges::Entry entry_two = {
+ region_two_start, region_two_end - region_two_start, core_range_two};
+ ranges.Append(entry_two);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(2UL, ranges.GetSize());
+ ASSERT_THAT(region_one_start, ranges.GetEntryAtIndex(0)->GetRangeBase());
+ ASSERT_THAT(region_two_start, ranges.GetEntryAtIndex(1)->GetRangeBase());
+}
+
+TEST(CoreFileMemoryRangesTest, PartialOverlapping) {
+ const int permissions = 0x3;
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t end_addr = 0x2000;
+ llvm::AddressRange range_one(start_addr, end_addr);
+ const CoreFileMemoryRange core_range_one = {range_one, permissions};
+ CoreFileMemoryRanges::Entry entry_one = {start_addr, end_addr - start_addr,
+ core_range_one};
+ llvm::AddressRange range_two(start_addr / 2, end_addr / 2);
+ const CoreFileMemoryRange core_range_two = {range_two, permissions};
+ CoreFileMemoryRanges::Entry entry_two = {
+ start_addr / 2, end_addr / 2 - start_addr / 2, core_range_two};
+ ranges.Append(entry_one);
+ ranges.Append(entry_two);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto searched_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(searched_range);
+ ASSERT_THAT(start_addr / 2, searched_range->GetRangeBase());
+ ASSERT_THAT(end_addr, searched_range->GetRangeEnd());
+}
diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp
index c646ba6..ca2cab6 100644
--- a/llvm/benchmarks/SandboxIRBench.cpp
+++ b/llvm/benchmarks/SandboxIRBench.cpp
@@ -34,15 +34,19 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
}
enum class IR {
- LLVM,
- SBox,
+ LLVM, ///< LLVM IR
+ SBoxNoTracking, ///< Sandbox IR with tracking disabled
+ SBoxTracking, ///< Sandbox IR with tracking enabled
};
// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from the IR enumerators.
template <IR IRTy> struct TypeSelect {};
template <> struct TypeSelect<IR::LLVM> {
using BasicBlock = llvm::BasicBlock;
};
-template <> struct TypeSelect<IR::SBox> {
+template <> struct TypeSelect<IR::SBoxNoTracking> {
+ using BasicBlock = sandboxir::BasicBlock;
+};
+template <> struct TypeSelect<IR::SBoxTracking> {
using BasicBlock = sandboxir::BasicBlock;
};
@@ -59,12 +63,22 @@ genIR(std::unique_ptr<llvm::Module> &LLVMM, LLVMContext &LLVMCtx,
sandboxir::Function *F = Ctx.createFunction(LLVMF);
sandboxir::BasicBlock *BB = &*F->begin();
+ // Start tracking if we are testing with tracking enabled.
+ if constexpr (IRTy == IR::SBoxTracking)
+ Ctx.save();
+
if constexpr (IRTy == IR::LLVM)
return LLVMBB;
else
return BB;
}
+template <IR IRTy> static void finalize(sandboxir::Context &Ctx) {
+ // Accept changes if we are tracking.
+ if constexpr (IRTy == IR::SBoxTracking)
+ Ctx.accept();
+}
+
static std::string generateBBWalkIR(unsigned Size) {
std::stringstream SS;
SS << "define void @foo(i32 %v1, i32 %v2) {\n";
@@ -132,15 +146,61 @@ template <IR IRTy> static void RAUW(benchmark::State &State) {
Def1->replaceAllUsesWith(Def2);
Def2->replaceAllUsesWith(Def1);
}
+ finalize<IRTy>(Ctx);
+}
+
+static std::string generateRUOWIR(unsigned NumOperands) {
+ std::stringstream SS;
+ auto GenOps = [&SS, NumOperands]() {
+ for (auto Cnt : seq<unsigned>(0, NumOperands)) {
+ SS << "i8 %arg" << Cnt;
+ bool IsLast = Cnt + 1 == NumOperands;
+ if (!IsLast)
+ SS << ", ";
+ }
+ };
+
+ SS << "define void @foo(";
+ GenOps();
+ SS << ") {\n";
+
+ SS << " call void @foo(";
+ GenOps();
+ SS << ")\n";
+ SS << "ret void";
+ SS << "}";
+ return SS.str();
+}
+
+template <IR IRTy> static void RUOW(benchmark::State &State) {
+ LLVMContext LLVMCtx;
+ sandboxir::Context Ctx(LLVMCtx);
+ std::unique_ptr<llvm::Module> LLVMM;
+ unsigned NumOperands = State.range(0);
+ auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateRUOWIR, NumOperands);
+
+ auto It = BB->begin();
+ auto *F = BB->getParent();
+ auto *Arg0 = F->getArg(0);
+ auto *Arg1 = F->getArg(1);
+ auto *Call = &*It++;
+ for (auto _ : State)
+ Call->replaceUsesOfWith(Arg0, Arg1);
+ finalize<IRTy>(Ctx);
}
BENCHMARK(GetType<IR::LLVM>);
-BENCHMARK(GetType<IR::SBox>);
+BENCHMARK(GetType<IR::SBoxNoTracking>);
BENCHMARK(BBWalk<IR::LLVM>)->Args({1024});
-BENCHMARK(BBWalk<IR::SBox>)->Args({1024});
+BENCHMARK(BBWalk<IR::SBoxTracking>)->Args({1024});
BENCHMARK(RAUW<IR::LLVM>)->Args({512});
-BENCHMARK(RAUW<IR::SBox>)->Args({512});
+BENCHMARK(RAUW<IR::SBoxNoTracking>)->Args({512});
+BENCHMARK(RAUW<IR::SBoxTracking>)->Args({512});
+
+BENCHMARK(RUOW<IR::LLVM>)->Args({4096});
+BENCHMARK(RUOW<IR::SBoxNoTracking>)->Args({4096});
+BENCHMARK(RUOW<IR::SBoxTracking>)->Args({4096});
BENCHMARK_MAIN();
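
The NoTracking/Tracking split works because the benchmark dispatches on a non-type template parameter, so the tracking calls compile away entirely in the no-tracking instantiation. A self-contained sketch of that TypeSelect/finalize pattern (the Mode names and printed strings are illustrative, not the SandboxIR API):

#include <iostream>

enum class Mode { Plain, Tracking };

// Compile-time trait selection keyed on the enum, as TypeSelect<IR> does.
template <Mode M> struct Select;
template <> struct Select<Mode::Plain> {
  static constexpr const char *Name = "plain";
};
template <> struct Select<Mode::Tracking> {
  static constexpr const char *Name = "tracking";
};

// 'if constexpr' removes the save/accept calls from the Plain instantiation
// at compile time, so the no-tracking variant pays no runtime check.
template <Mode M> void run() {
  if constexpr (M == Mode::Tracking)
    std::cout << "save()\n"; // stand-in for Ctx.save()
  std::cout << Select<M>::Name << " body\n";
  if constexpr (M == Mode::Tracking)
    std::cout << "accept()\n"; // stand-in for Ctx.accept()
}

int main() {
  run<Mode::Plain>();
  run<Mode::Tracking>();
}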
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 00290c9..083d5c9 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -109,7 +109,8 @@ public:
void clear() {
incrementEpoch();
- if (getNumEntries() == 0 && getNumTombstones() == 0) return;
+ if (getNumEntries() == 0 && getNumTombstones() == 0)
+ return;
// If the capacity of the array is huge, and the # elements used is small,
// shrink the array.
@@ -119,7 +120,7 @@ public:
}
const KeyT EmptyKey = getEmptyKey();
- if (std::is_trivially_destructible<ValueT>::value) {
+ if constexpr (std::is_trivially_destructible_v<ValueT>) {
// Use a simpler loop when values don't need destruction.
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
P->getFirst() = EmptyKey;
@@ -172,15 +173,14 @@ public:
/// The DenseMapInfo is responsible for supplying methods
/// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key
/// type used.
- template<class LookupKeyT>
- iterator find_as(const LookupKeyT &Val) {
+ template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) {
if (BucketT *Bucket = doFind(Val))
return makeIterator(
Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
*this, true);
return end();
}
- template<class LookupKeyT>
+ template <class LookupKeyT>
const_iterator find_as(const LookupKeyT &Val) const {
if (const BucketT *Bucket = doFind(Val))
return makeConstIterator(
@@ -223,7 +223,7 @@ public:
// The value is constructed in-place if the key is not in the map, otherwise
// it is not moved.
template <typename... Ts>
- std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&... Args) {
+ std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&...Args) {
BucketT *TheBucket;
if (LookupBucketFor(Key, TheBucket))
return std::make_pair(makeIterator(TheBucket,
@@ -248,7 +248,7 @@ public:
// The value is constructed in-place if the key is not in the map, otherwise
// it is not moved.
template <typename... Ts>
- std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&... Args) {
+ std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&...Args) {
BucketT *TheBucket;
if (LookupBucketFor(Key, TheBucket))
return std::make_pair(makeIterator(TheBucket,
@@ -297,8 +297,7 @@ public:
}
/// insert - Range insertion of pairs.
- template<typename InputIt>
- void insert(InputIt I, InputIt E) {
+ template <typename InputIt> void insert(InputIt I, InputIt E) {
for (; I != E; ++I)
insert(*I);
}
@@ -420,7 +419,7 @@ protected:
setNumEntries(0);
setNumTombstones(0);
- assert((getNumBuckets() & (getNumBuckets()-1)) == 0 &&
+ assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 &&
"# initial buckets must be a power of two!");
const KeyT EmptyKey = getEmptyKey();
for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
@@ -491,7 +490,7 @@ protected:
return KeyInfoT::getHashValue(Val);
}
- template<typename LookupKeyT>
+ template <typename LookupKeyT>
static unsigned getHashValue(const LookupKeyT &Val) {
return KeyInfoT::getHashValue(Val);
}
@@ -502,14 +501,11 @@ protected:
return KeyInfoT::getEmptyKey();
}
- static const KeyT getTombstoneKey() {
- return KeyInfoT::getTombstoneKey();
- }
+ static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); }
private:
- iterator makeIterator(BucketT *P, BucketT *E,
- DebugEpochBase &Epoch,
- bool NoAdvance=false) {
+ iterator makeIterator(BucketT *P, BucketT *E, DebugEpochBase &Epoch,
+ bool NoAdvance = false) {
if (shouldReverseIterate<KeyT>()) {
BucketT *B = P == getBucketsEnd() ? getBuckets() : P + 1;
return iterator(B, E, Epoch, NoAdvance);
@@ -519,7 +515,7 @@ private:
const_iterator makeConstIterator(const BucketT *P, const BucketT *E,
const DebugEpochBase &Epoch,
- const bool NoAdvance=false) const {
+ const bool NoAdvance = false) const {
if (shouldReverseIterate<KeyT>()) {
const BucketT *B = P == getBucketsEnd() ? getBuckets() : P + 1;
return const_iterator(B, E, Epoch, NoAdvance);
@@ -535,13 +531,9 @@ private:
static_cast<DerivedT *>(this)->setNumEntries(Num);
}
- void incrementNumEntries() {
- setNumEntries(getNumEntries() + 1);
- }
+ void incrementNumEntries() { setNumEntries(getNumEntries() + 1); }
- void decrementNumEntries() {
- setNumEntries(getNumEntries() - 1);
- }
+ void decrementNumEntries() { setNumEntries(getNumEntries() - 1); }
unsigned getNumTombstones() const {
return static_cast<const DerivedT *>(this)->getNumTombstones();
@@ -551,45 +543,33 @@ private:
static_cast<DerivedT *>(this)->setNumTombstones(Num);
}
- void incrementNumTombstones() {
- setNumTombstones(getNumTombstones() + 1);
- }
+ void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); }
- void decrementNumTombstones() {
- setNumTombstones(getNumTombstones() - 1);
- }
+ void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); }
const BucketT *getBuckets() const {
return static_cast<const DerivedT *>(this)->getBuckets();
}
- BucketT *getBuckets() {
- return static_cast<DerivedT *>(this)->getBuckets();
- }
+ BucketT *getBuckets() { return static_cast<DerivedT *>(this)->getBuckets(); }
unsigned getNumBuckets() const {
return static_cast<const DerivedT *>(this)->getNumBuckets();
}
- BucketT *getBucketsEnd() {
- return getBuckets() + getNumBuckets();
- }
+ BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); }
const BucketT *getBucketsEnd() const {
return getBuckets() + getNumBuckets();
}
- void grow(unsigned AtLeast) {
- static_cast<DerivedT *>(this)->grow(AtLeast);
- }
+ void grow(unsigned AtLeast) { static_cast<DerivedT *>(this)->grow(AtLeast); }
- void shrink_and_clear() {
- static_cast<DerivedT *>(this)->shrink_and_clear();
- }
+ void shrink_and_clear() { static_cast<DerivedT *>(this)->shrink_and_clear(); }
template <typename KeyArg, typename... ValueArgs>
BucketT *InsertIntoBucket(BucketT *TheBucket, KeyArg &&Key,
- ValueArgs &&... Values) {
+ ValueArgs &&...Values) {
TheBucket = InsertIntoBucketImpl(Key, Key, TheBucket);
TheBucket->getFirst() = std::forward<KeyArg>(Key);
@@ -627,8 +607,9 @@ private:
this->grow(NumBuckets * 2);
LookupBucketFor(Lookup, TheBucket);
NumBuckets = getNumBuckets();
- } else if (LLVM_UNLIKELY(NumBuckets-(NewNumEntries+getNumTombstones()) <=
- NumBuckets/8)) {
+ } else if (LLVM_UNLIKELY(NumBuckets -
+ (NewNumEntries + getNumTombstones()) <=
+ NumBuckets / 8)) {
this->grow(NumBuckets);
LookupBucketFor(Lookup, TheBucket);
}
@@ -696,7 +677,7 @@ private:
!KeyInfoT::isEqual(Val, TombstoneKey) &&
"Empty/Tombstone value shouldn't be inserted into map!");
- unsigned BucketNo = getHashValue(Val) & (NumBuckets-1);
+ unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
unsigned ProbeAmt = 1;
while (true) {
BucketT *ThisBucket = BucketsPtr + BucketNo;
@@ -719,12 +700,12 @@ private:
// prefer to return it than something that would require more probing.
if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) &&
!FoundTombstone)
- FoundTombstone = ThisBucket; // Remember the first tombstone found.
+ FoundTombstone = ThisBucket; // Remember the first tombstone found.
// Otherwise, it's a hash collision or a tombstone, continue quadratic
// probing.
BucketNo += ProbeAmt++;
- BucketNo &= (NumBuckets-1);
+ BucketNo &= (NumBuckets - 1);
}
}
@@ -733,9 +714,7 @@ public:
/// This is just the raw memory used by DenseMap.
/// If entries are pointers to objects, the size of the referenced objects
/// are not included.
- size_t getMemorySize() const {
- return getNumBuckets() * sizeof(BucketT);
- }
+ size_t getMemorySize() const { return getNumBuckets() * sizeof(BucketT); }
};
/// Equality comparison for DenseMap.
@@ -803,8 +782,7 @@ public:
swap(other);
}
- template<typename InputIt>
- DenseMap(const InputIt &I, const InputIt &E) {
+ template <typename InputIt> DenseMap(const InputIt &I, const InputIt &E) {
init(std::distance(I, E));
this->insert(I, E);
}
@@ -819,7 +797,7 @@ public:
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
}
- void swap(DenseMap& RHS) {
+ void swap(DenseMap &RHS) {
this->incrementEpoch();
RHS.incrementEpoch();
std::swap(Buckets, RHS.Buckets);
@@ -828,13 +806,13 @@ public:
std::swap(NumBuckets, RHS.NumBuckets);
}
- DenseMap& operator=(const DenseMap& other) {
+ DenseMap &operator=(const DenseMap &other) {
if (&other != this)
copyFrom(other);
return *this;
}
- DenseMap& operator=(DenseMap &&other) {
+ DenseMap &operator=(DenseMap &&other) {
this->destroyAll();
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
init(0);
@@ -842,7 +820,7 @@ public:
return *this;
}
- void copyFrom(const DenseMap& other) {
+ void copyFrom(const DenseMap &other) {
this->destroyAll();
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
if (allocateBuckets(other.NumBuckets)) {
@@ -867,14 +845,15 @@ public:
unsigned OldNumBuckets = NumBuckets;
BucketT *OldBuckets = Buckets;
- allocateBuckets(std::max<unsigned>(64, static_cast<unsigned>(NextPowerOf2(AtLeast-1))));
+ allocateBuckets(std::max<unsigned>(
+ 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
assert(Buckets);
if (!OldBuckets) {
this->BaseT::initEmpty();
return;
}
- this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets);
+ this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets);
// Free the old table.
deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
@@ -901,29 +880,17 @@ public:
}
private:
- unsigned getNumEntries() const {
- return NumEntries;
- }
+ unsigned getNumEntries() const { return NumEntries; }
- void setNumEntries(unsigned Num) {
- NumEntries = Num;
- }
+ void setNumEntries(unsigned Num) { NumEntries = Num; }
- unsigned getNumTombstones() const {
- return NumTombstones;
- }
+ unsigned getNumTombstones() const { return NumTombstones; }
- void setNumTombstones(unsigned Num) {
- NumTombstones = Num;
- }
+ void setNumTombstones(unsigned Num) { NumTombstones = Num; }
- BucketT *getBuckets() const {
- return Buckets;
- }
+ BucketT *getBuckets() const { return Buckets; }
- unsigned getNumBuckets() const {
- return NumBuckets;
- }
+ unsigned getNumBuckets() const { return NumBuckets; }
bool allocateBuckets(unsigned Num) {
NumBuckets = Num;
@@ -984,7 +951,7 @@ public:
swap(other);
}
- template<typename InputIt>
+ template <typename InputIt>
SmallDenseMap(const InputIt &I, const InputIt &E) {
init(NextPowerOf2(std::distance(I, E)));
this->insert(I, E);
@@ -998,7 +965,7 @@ public:
deallocateBuckets();
}
- void swap(SmallDenseMap& RHS) {
+ void swap(SmallDenseMap &RHS) {
unsigned TmpNumEntries = RHS.NumEntries;
RHS.NumEntries = NumEntries;
NumEntries = TmpNumEntries;
@@ -1070,13 +1037,13 @@ public:
new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep));
}
- SmallDenseMap& operator=(const SmallDenseMap& other) {
+ SmallDenseMap &operator=(const SmallDenseMap &other) {
if (&other != this)
copyFrom(other);
return *this;
}
- SmallDenseMap& operator=(SmallDenseMap &&other) {
+ SmallDenseMap &operator=(SmallDenseMap &&other) {
this->destroyAll();
deallocateBuckets();
init(0);
@@ -1084,7 +1051,7 @@ public:
return *this;
}
- void copyFrom(const SmallDenseMap& other) {
+ void copyFrom(const SmallDenseMap &other) {
this->destroyAll();
deallocateBuckets();
Small = true;
@@ -1106,7 +1073,7 @@ public:
void grow(unsigned AtLeast) {
if (AtLeast > InlineBuckets)
- AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1));
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
if (Small) {
// First move the inline buckets into a temporary storage.
@@ -1150,7 +1117,8 @@ public:
new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
}
- this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets);
+ this->moveFromOldBuckets(OldRep.Buckets,
+ OldRep.Buckets + OldRep.NumBuckets);
// Free the old table.
deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
@@ -1179,9 +1147,7 @@ public:
}
private:
- unsigned getNumEntries() const {
- return NumEntries;
- }
+ unsigned getNumEntries() const { return NumEntries; }
void setNumEntries(unsigned Num) {
// NumEntries is hardcoded to be 31 bits wide.
@@ -1189,13 +1155,9 @@ private:
NumEntries = Num;
}
- unsigned getNumTombstones() const {
- return NumTombstones;
- }
+ unsigned getNumTombstones() const { return NumTombstones; }
- void setNumTombstones(unsigned Num) {
- NumTombstones = Num;
- }
+ void setNumTombstones(unsigned Num) { NumTombstones = Num; }
const BucketT *getInlineBuckets() const {
assert(Small);
@@ -1207,7 +1169,7 @@ private:
BucketT *getInlineBuckets() {
return const_cast<BucketT *>(
- const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
+ const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
}
const LargeRep *getLargeRep() const {
@@ -1218,7 +1180,7 @@ private:
LargeRep *getLargeRep() {
return const_cast<LargeRep *>(
- const_cast<const SmallDenseMap *>(this)->getLargeRep());
+ const_cast<const SmallDenseMap *>(this)->getLargeRep());
}
const BucketT *getBuckets() const {
@@ -1227,7 +1189,7 @@ private:
BucketT *getBuckets() {
return const_cast<BucketT *>(
- const_cast<const SmallDenseMap *>(this)->getBuckets());
+ const_cast<const SmallDenseMap *>(this)->getBuckets());
}
unsigned getNumBuckets() const {
@@ -1278,7 +1240,8 @@ public:
: DebugEpochBase::HandleBase(&Epoch), Ptr(Pos), End(E) {
assert(isHandleInSync() && "invalid construction!");
- if (NoAdvance) return;
+ if (NoAdvance)
+ return;
if (shouldReverseIterate<KeyT>()) {
RetreatPastEmptyBuckets();
return;
@@ -1324,7 +1287,7 @@ public:
return !(LHS == RHS);
}
- inline DenseMapIterator& operator++() { // Preincrement
+ inline DenseMapIterator &operator++() { // Preincrement
assert(isHandleInSync() && "invalid iterator access!");
assert(Ptr != End && "incrementing end() iterator");
if (shouldReverseIterate<KeyT>()) {
@@ -1336,9 +1299,11 @@ public:
AdvancePastEmptyBuckets();
return *this;
}
- DenseMapIterator operator++(int) { // Postincrement
+ DenseMapIterator operator++(int) { // Postincrement
assert(isHandleInSync() && "invalid iterator access!");
- DenseMapIterator tmp = *this; ++*this; return tmp;
+ DenseMapIterator tmp = *this;
+ ++*this;
+ return tmp;
}
private:
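
Most of the DenseMap.h hunks are clang-format churn, but the clear() change from a runtime if to if constexpr on is_trivially_destructible is worth a note: the destructor loop in the discarded branch is no longer instantiated for trivial value types. A small standalone illustration of the difference (not DenseMap itself):

#include <cstdio>
#include <new>
#include <type_traits>

struct Noisy {
  ~Noisy() { std::puts("~Noisy"); }
};

// Mirrors the clear() change: with if constexpr, the destructor loop below
// is never instantiated when T is trivially destructible.
template <typename T> void destroy_all(T *begin, T *end) {
  if constexpr (std::is_trivially_destructible_v<T>) {
    (void)begin;
    (void)end; // nothing to destroy; take the simple path
  } else {
    for (T *p = begin; p != end; ++p)
      p->~T();
  }
}

int main() {
  alignas(Noisy) unsigned char raw[2 * sizeof(Noisy)];
  Noisy *objs = reinterpret_cast<Noisy *>(raw);
  new (objs) Noisy;
  new (objs + 1) Noisy;
  destroy_all(objs, objs + 2); // prints ~Noisy twice

  int ints[4] = {};
  destroy_all(ints, ints + 4); // no destructor loop instantiated
}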
diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h
index b89c886..a307bd8 100644
--- a/llvm/include/llvm/ADT/DenseSet.h
+++ b/llvm/include/llvm/ADT/DenseSet.h
@@ -89,18 +89,12 @@ public:
/// before resizing again.
void reserve(size_t Size) { TheMap.reserve(Size); }
- void clear() {
- TheMap.clear();
- }
+ void clear() { TheMap.clear(); }
/// Return 1 if the specified key is in the set, 0 otherwise.
- size_type count(const_arg_type_t<ValueT> V) const {
- return TheMap.count(V);
- }
+ size_type count(const_arg_type_t<ValueT> V) const { return TheMap.count(V); }
- bool erase(const ValueT &V) {
- return TheMap.erase(V);
- }
+ bool erase(const ValueT &V) { return TheMap.erase(V); }
void swap(DenseSetImpl &RHS) { TheMap.swap(RHS.TheMap); }
@@ -128,8 +122,15 @@ public:
ValueT *operator->() { return &I->getFirst(); }
const ValueT *operator->() const { return &I->getFirst(); }
- Iterator& operator++() { ++I; return *this; }
- Iterator operator++(int) { auto T = *this; ++I; return T; }
+ Iterator &operator++() {
+ ++I;
+ return *this;
+ }
+ Iterator operator++(int) {
+ auto T = *this;
+ ++I;
+ return T;
+ }
friend bool operator==(const Iterator &X, const Iterator &Y) {
return X.I == Y.I;
}
@@ -157,8 +158,15 @@ public:
const ValueT &operator*() const { return I->getFirst(); }
const ValueT *operator->() const { return &I->getFirst(); }
- ConstIterator& operator++() { ++I; return *this; }
- ConstIterator operator++(int) { auto T = *this; ++I; return T; }
+ ConstIterator &operator++() {
+ ++I;
+ return *this;
+ }
+ ConstIterator operator++(int) {
+ auto T = *this;
+ ++I;
+ return T;
+ }
friend bool operator==(const ConstIterator &X, const ConstIterator &Y) {
return X.I == Y.I;
}
@@ -191,8 +199,7 @@ public:
/// The DenseMapInfo is responsible for supplying methods
/// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type
/// used.
- template <class LookupKeyT>
- iterator find_as(const LookupKeyT &Val) {
+ template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) {
return Iterator(TheMap.find_as(Val));
}
template <class LookupKeyT>
@@ -226,8 +233,7 @@ public:
}
// Range insertion of values.
- template<typename InputIt>
- void insert(InputIt I, InputIt E) {
+ template <typename InputIt> void insert(InputIt I, InputIt E) {
for (; I != E; ++I)
insert(*I);
}
@@ -266,8 +272,9 @@ bool operator!=(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
/// Implements a dense probed hash-table based set.
template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>>
class DenseSet : public detail::DenseSetImpl<
- ValueT, DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT,
- detail::DenseSetPair<ValueT>>,
+ ValueT,
+ DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT,
+ detail::DenseSetPair<ValueT>>,
ValueInfoT> {
using BaseT =
detail::DenseSetImpl<ValueT,
@@ -285,12 +292,14 @@ template <typename ValueT, unsigned InlineBuckets = 4,
typename ValueInfoT = DenseMapInfo<ValueT>>
class SmallDenseSet
: public detail::DenseSetImpl<
- ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
- ValueInfoT, detail::DenseSetPair<ValueT>>,
+ ValueT,
+ SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
+ ValueInfoT, detail::DenseSetPair<ValueT>>,
ValueInfoT> {
using BaseT = detail::DenseSetImpl<
- ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
- ValueInfoT, detail::DenseSetPair<ValueT>>,
+ ValueT,
+ SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, ValueInfoT,
+ detail::DenseSetPair<ValueT>>,
ValueInfoT>;
public:
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index c5eff15..09fc34a 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -218,8 +218,8 @@ protected:
inline unsigned getID() const;
MemoryAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue,
- BasicBlock *BB, unsigned NumOperands)
- : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue),
+ BasicBlock *BB, AllocInfo AllocInfo)
+ : DerivedUser(Type::getVoidTy(C), Vty, AllocInfo, DeleteValue),
Block(BB) {}
// Use deleteValue() to delete a generic MemoryAccess.
@@ -280,8 +280,8 @@ protected:
MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty,
DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB,
- unsigned NumOperands)
- : MemoryAccess(C, Vty, DeleteValue, BB, NumOperands),
+ AllocInfo AllocInfo)
+ : MemoryAccess(C, Vty, DeleteValue, BB, AllocInfo),
MemoryInstruction(MI) {
setDefiningAccess(DMA);
}
@@ -307,15 +307,16 @@ private:
/// MemoryUse's is exactly the set of Instructions for which
/// AliasAnalysis::getModRefInfo returns "Ref".
class MemoryUse final : public MemoryUseOrDef {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryUse(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB)
- : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB,
- /*NumOperands=*/1) {}
+ : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, AllocMarker) {}
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static bool classof(const Value *MA) {
@@ -367,6 +368,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUse, MemoryAccess)
/// associated with them. This use points to the nearest reaching
/// MemoryDef/MemoryPhi.
class MemoryDef final : public MemoryUseOrDef {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
friend class MemorySSA;
@@ -374,12 +377,11 @@ public:
MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB,
unsigned Ver)
- : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB,
- /*NumOperands=*/2),
+ : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, AllocMarker),
ID(Ver) {}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static bool classof(const Value *MA) {
@@ -474,8 +476,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess)
/// Because MemoryUse's do not generate new definitions, they do not have this
/// issue.
class MemoryPhi final : public MemoryAccess {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
public:
void operator delete(void *Ptr) { User::operator delete(Ptr); }
@@ -484,7 +488,7 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0)
- : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, 0), ID(Ver),
+ : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, AllocMarker), ID(Ver),
ReservedSpace(NumPreds) {
allocHungoffUses(ReservedSpace);
}
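
This is one instance of a wider migration (continued in Constant.h and Constants.h below): constructors that used to take a bare unsigned NumOperands now take a marker object, so each subclass states its operand count once as a constexpr static and passes the same marker to both operator new and the base constructor. A self-contained sketch of the tag-type idea with hypothetical names (not LLVM's actual User/AllocInfo machinery):

#include <cstdio>

// Hypothetical stand-in for IntrusiveOperandsAllocMarker: a tag carrying
// the operand count as typed data instead of a bare unsigned argument.
struct OperandsAllocMarker {
  unsigned NumOps;
};

struct Base {
  unsigned NumOperands;
  // Callers must name a marker; a raw integer no longer converts silently.
  explicit Base(OperandsAllocMarker M) : NumOperands(M.NumOps) {}
};

struct UseLike final : Base {
  static constexpr OperandsAllocMarker AllocMarker{1};
  UseLike() : Base(AllocMarker) {}
};

struct DefLike final : Base {
  static constexpr OperandsAllocMarker AllocMarker{2};
  DefLike() : Base(AllocMarker) {}
};

int main() {
  std::printf("UseLike operands: %u\n", UseLike().NumOperands);
  std::printf("DefLike operands: %u\n", DefLike().NumOperands);
}

The payoff is the one shown in the MemoryUse/MemoryDef hunks above: the operand count lives in a single declaration per class instead of being repeated as a magic number in the constructor and in operator new.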
diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h
index a82e37b..0aefb5e 100644
--- a/llvm/include/llvm/IR/Constant.h
+++ b/llvm/include/llvm/IR/Constant.h
@@ -41,8 +41,8 @@ class APInt;
/// LLVM Constant Representation
class Constant : public User {
protected:
- Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps)
- : User(ty, vty, Ops, NumOps) {}
+ Constant(Type *ty, ValueTy vty, AllocInfo AllocInfo)
+ : User(ty, vty, AllocInfo) {}
~Constant() = default;
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index 62ccde9..3b16aa0 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -51,6 +51,8 @@ template <class ConstantClass> struct ConstantAggrKeyType;
/// Since they can be in use by unrelated modules (and are never based on
/// GlobalValues), it never makes sense to RAUW them.
class ConstantData : public Constant {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
friend class Constant;
Value *handleOperandChangeImpl(Value *From, Value *To) {
@@ -58,9 +60,9 @@ class ConstantData : public Constant {
}
protected:
- explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {}
+ explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, AllocMarker) {}
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
public:
void operator delete(void *Ptr) { User::operator delete(Ptr); }
@@ -399,7 +401,8 @@ public:
/// use operands.
class ConstantAggregate : public Constant {
protected:
- ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V);
+ ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo);
public:
/// Transparently provide more efficient getOperand methods.
@@ -425,7 +428,7 @@ class ConstantArray final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantArray>;
friend class Constant;
- ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
+ ConstantArray(ArrayType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -457,7 +460,7 @@ class ConstantStruct final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantStruct>;
friend class Constant;
- ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
+ ConstantStruct(StructType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -509,7 +512,7 @@ class ConstantVector final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantVector>;
friend class Constant;
- ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
+ ConstantVector(VectorType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -890,9 +893,11 @@ public:
class BlockAddress final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
BlockAddress(Function *F, BasicBlock *BB);
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -936,9 +941,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BlockAddress, Value)
class DSOLocalEquivalent final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
DSOLocalEquivalent(GlobalValue *GV);
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -973,9 +980,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DSOLocalEquivalent, Value)
class NoCFIValue final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
NoCFIValue(GlobalValue *GV);
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -1013,10 +1022,12 @@ class ConstantPtrAuth final : public Constant {
friend struct ConstantPtrAuthKeyType;
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{4};
+
ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc,
Constant *AddrDisc);
- void *operator new(size_t s) { return User::operator new(s, 4); }
+ void *operator new(size_t s) { return User::operator new(s, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -1102,8 +1113,8 @@ class ConstantExpr : public Constant {
Value *handleOperandChangeImpl(Value *From, Value *To);
protected:
- ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps)
- : Constant(ty, ConstantExprVal, Ops, NumOps) {
+ ConstantExpr(Type *ty, unsigned Opcode, AllocInfo AllocInfo)
+ : Constant(ty, ConstantExprVal, AllocInfo) {
// Operation type (an Instruction opcode) is stored as the SubclassData.
setValueSubclassData(Opcode);
}
diff --git a/llvm/include/llvm/IR/DerivedUser.h b/llvm/include/llvm/IR/DerivedUser.h
index a25d316..a307315 100644
--- a/llvm/include/llvm/IR/DerivedUser.h
+++ b/llvm/include/llvm/IR/DerivedUser.h
@@ -34,9 +34,9 @@ private:
DeleteValueTy DeleteValue;
public:
- DerivedUser(Type *Ty, unsigned VK, Use *U, unsigned NumOps,
+ DerivedUser(Type *Ty, unsigned VK, AllocInfo AllocInfo,
DeleteValueTy DeleteValue)
- : User(Ty, VK, U, NumOps), DeleteValue(DeleteValue) {}
+ : User(Ty, VK, AllocInfo), DeleteValue(DeleteValue) {}
};
} // end namespace llvm
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index f7e4e97..866c68d 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -72,6 +72,8 @@ public:
using const_arg_iterator = const Argument *;
private:
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
// Important things that make up a function!
BasicBlockListType BasicBlocks; ///< The basic blocks
@@ -171,13 +173,14 @@ public:
static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
unsigned AddrSpace, const Twine &N = "",
Module *M = nullptr) {
- return new Function(Ty, Linkage, AddrSpace, N, M);
+ return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
}
// TODO: remove this once all users have been updated to pass an AddrSpace
static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
const Twine &N = "", Module *M = nullptr) {
- return new Function(Ty, Linkage, static_cast<unsigned>(-1), N, M);
+ return new (AllocMarker)
+ Function(Ty, Linkage, static_cast<unsigned>(-1), N, M);
}
/// Creates a new function and attaches it to a module.
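
Function is the first hung-off case in the patch: its operands live in a separate allocation, so the marker carries no count, and the choice is now spelled out at the call site via placement new. A condensed sketch of the shape, taken from the Function hunks above:

// HungOffOperandsAllocMarker is empty: the operand list is allocated
// separately (see allocHungoffUses) and may grow after construction.
constexpr static HungOffOperandsAllocMarker AllocMarker{};

static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
                        unsigned AddrSpace, const Twine &N, Module *M) {
  return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
}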
diff --git a/llvm/include/llvm/IR/GlobalAlias.h b/llvm/include/llvm/IR/GlobalAlias.h
index 583d66e..3db6984 100644
--- a/llvm/include/llvm/IR/GlobalAlias.h
+++ b/llvm/include/llvm/IR/GlobalAlias.h
@@ -28,6 +28,8 @@ template <typename ValueSubClass, typename... Args> class SymbolTableListTraits;
class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
friend class SymbolTableListTraits<GlobalAlias>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
const Twine &Name, Constant *Aliasee, Module *Parent);
@@ -59,7 +61,7 @@ public:
static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee);
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Provide fast operand accessors
diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h
index 8935284..0d2f152 100644
--- a/llvm/include/llvm/IR/GlobalIFunc.h
+++ b/llvm/include/llvm/IR/GlobalIFunc.h
@@ -34,6 +34,8 @@ template <typename ValueSubClass, typename... Args> class SymbolTableListTraits;
class GlobalIFunc final : public GlobalObject, public ilist_node<GlobalIFunc> {
friend class SymbolTableListTraits<GlobalIFunc>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
const Twine &Name, Constant *Resolver, Module *Parent);
@@ -48,7 +50,7 @@ public:
Constant *Resolver, Module *Parent);
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Provide fast operand accessors
diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h
index b6a974d..08edc13 100644
--- a/llvm/include/llvm/IR/GlobalObject.h
+++ b/llvm/include/llvm/IR/GlobalObject.h
@@ -40,10 +40,9 @@ public:
};
protected:
- GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
- LinkageTypes Linkage, const Twine &Name,
- unsigned AddressSpace = 0)
- : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace) {
+ GlobalObject(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+ const Twine &Name, unsigned AddressSpace = 0)
+ : GlobalValue(Ty, VTy, AllocInfo, Linkage, Name, AddressSpace) {
setGlobalValueSubClassData(0);
}
~GlobalObject();
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 53eddeb..d9104d7 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -77,9 +77,9 @@ public:
};
protected:
- GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
- LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace)
- : Constant(PointerType::get(Ty, AddressSpace), VTy, Ops, NumOps),
+ GlobalValue(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+ const Twine &Name, unsigned AddressSpace)
+ : Constant(PointerType::get(Ty, AddressSpace), VTy, AllocInfo),
ValueType(Ty), Visibility(DefaultVisibility),
UnnamedAddrVal(unsigned(UnnamedAddr::None)),
DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal),
diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h
index 0736c300..83e4848 100644
--- a/llvm/include/llvm/IR/GlobalVariable.h
+++ b/llvm/include/llvm/IR/GlobalVariable.h
@@ -39,6 +39,8 @@ class DIGlobalVariableExpression;
class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
friend class SymbolTableListTraits<GlobalVariable>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
AttributeSet Attrs;
// Is this a global constant?
@@ -70,24 +72,31 @@ public:
GlobalVariable(const GlobalVariable &) = delete;
GlobalVariable &operator=(const GlobalVariable &) = delete;
+private:
+ /// Set the number of operands on a GlobalVariable.
+ ///
+ /// GlobalVariable always allocates space for a single operand, but
+ /// doesn't always use it.
+ void setGlobalVariableNumOperands(unsigned NumOps) {
+ assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands");
+ NumUserOperands = NumOps;
+ }
+
+public:
~GlobalVariable() {
dropAllReferences();
+
+ // The number of operands can be set to 0 after construction and
+ // initialization. Make sure it is reset to 1 here, as this is needed in
+ // User::operator delete.
+ setGlobalVariableNumOperands(1);
}
// allocate space for exactly one operand
- void *operator new(size_t s) {
- return User::operator new(s, 1);
- }
+ void *operator new(size_t s) { return User::operator new(s, AllocMarker); }
// delete space for exactly one operand as created in the corresponding new operator
- void operator delete(void *ptr){
- assert(ptr != nullptr && "must not be nullptr");
- User *Obj = static_cast<User *>(ptr);
- // Number of operands can be set to 0 after construction and initialization. Make sure
- // that number of operands is reset to 1, as this is needed in User::operator delete
- Obj->setGlobalVariableNumOperands(1);
- User::operator delete(Obj);
- }
+ void operator delete(void *ptr) { User::operator delete(ptr); }
/// Provide fast operand accessors
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
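
The destructor change works because User::operator delete recomputes the start of the combined allocation from the operand count. A simplified illustration of that layout arithmetic (an assumption-labeled sketch, not the actual LLVM implementation):

// Co-allocated layout: [Use #0 ... Use #N-1][User object]. If delete ran
// while NumUserOperands was still 0, the computed block start would be off
// by one Use, which is why ~GlobalVariable restores the count to 1 first.
void operator delete(void *Usr) {
  User *Obj = static_cast<User *>(Usr);
  Use *Storage = static_cast<Use *>(Usr) - Obj->getNumOperands();
  ::operator delete(Storage);
}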
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 5ed3ec4..4720533 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -55,24 +55,26 @@ typedef unsigned ID;
//===----------------------------------------------------------------------===//
class UnaryInstruction : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
protected:
UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock::iterator IB)
- : Instruction(Ty, iType, &Op<0>(), 1, IB) {
+ : Instruction(Ty, iType, AllocMarker, IB) {
Op<0>() = V;
}
UnaryInstruction(Type *Ty, unsigned iType, Value *V,
Instruction *IB = nullptr)
- : Instruction(Ty, iType, &Op<0>(), 1, IB) {
+ : Instruction(Ty, iType, AllocMarker, IB) {
Op<0>() = V;
}
UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock *IAE)
- : Instruction(Ty, iType, &Op<0>(), 1, IAE) {
+ : Instruction(Ty, iType, AllocMarker, IAE) {
Op<0>() = V;
}
public:
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -186,6 +188,8 @@ public:
//===----------------------------------------------------------------------===//
class BinaryOperator : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
void AssertOK();
protected:
@@ -199,7 +203,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -745,6 +749,8 @@ public:
/// This class is the base class for the comparison instructions.
/// Abstract base class of comparison instructions.
class CmpInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
/// This enumeration lists the possible predicates for CmpInst subclasses.
/// Values in the range 0-31 are reserved for FCmpInst, while values in the
@@ -814,7 +820,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Construct a compare instruction, given the opcode, the predicate and
@@ -2416,10 +2422,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value)
//===----------------------------------------------------------------------===//
class FuncletPadInst : public Instruction {
private:
- FuncletPadInst(const FuncletPadInst &CPI);
+ FuncletPadInst(const FuncletPadInst &CPI, AllocInfo AllocInfo);
explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
- ArrayRef<Value *> Args, unsigned Values,
+ ArrayRef<Value *> Args, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(Value *ParentPad, ArrayRef<Value *> Args, const Twine &NameStr);
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index c275723..a12d5d9 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -1030,7 +1030,7 @@ protected:
setValueSubclassData(Storage);
}
- Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps,
+ Instruction(Type *Ty, unsigned iType, AllocInfo AllocInfo,
InsertPosition InsertBefore = nullptr);
private:
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index ab3321e..e89739a 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -297,6 +297,8 @@ class StoreInst : public Instruction {
void AssertOK();
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -314,7 +316,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Return true if this is a store to a volatile memory location.
@@ -420,6 +422,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value)
class FenceInst : public Instruction {
using OrderingField = AtomicOrderingBitfieldElementT<0>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
void Init(AtomicOrdering Ordering, SyncScope::ID SSID);
protected:
@@ -436,7 +440,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Returns the ordering constraint of this fence instruction.
@@ -502,6 +506,8 @@ class AtomicCmpXchgInst : public Instruction {
typename Bitfield::Element<AtomicOrdering, Offset, 3,
AtomicOrdering::LAST>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -515,7 +521,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly three operands
- void *operator new(size_t S) { return User::operator new(S, 3); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
using VolatileField = BoolBitfieldElementT<0>;
@@ -774,13 +780,15 @@ private:
using BinOpBitfieldElement =
typename Bitfield::Element<BinOp, Offset, 5, BinOp::LAST_BINOP>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment,
AtomicOrdering Ordering, SyncScope::ID SSID,
InsertPosition InsertBefore = nullptr);
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
using VolatileField = BoolBitfieldElementT<0>;
@@ -924,14 +932,14 @@ class GetElementPtrInst : public Instruction {
Type *SourceElementType;
Type *ResultElementType;
- GetElementPtrInst(const GetElementPtrInst &GEPI);
+ GetElementPtrInst(const GetElementPtrInst &GEPI, AllocInfo AllocInfo);
/// Constructors - Create a getelementptr instruction with a base pointer and
/// a list of indices. The first and second ctor can optionally insert before an
/// existing instruction, the third appends the new instruction to the
/// specified BasicBlock.
inline GetElementPtrInst(Type *PointeeType, Value *Ptr,
- ArrayRef<Value *> IdxList, unsigned Values,
+ ArrayRef<Value *> IdxList, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr);
@@ -949,8 +957,9 @@ public:
InsertPosition InsertBefore = nullptr) {
unsigned Values = 1 + unsigned(IdxList.size());
assert(PointeeType && "Must specify element type");
- return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values,
- NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{Values};
+ return new (AllocMarker) GetElementPtrInst(
+ PointeeType, Ptr, IdxList, AllocMarker, NameStr, InsertBefore);
}
static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr,
@@ -1124,12 +1133,11 @@ struct OperandTraits<GetElementPtrInst>
: public VariadicOperandTraits<GetElementPtrInst> {};
GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr,
- ArrayRef<Value *> IdxList, unsigned Values,
- const Twine &NameStr,
+ ArrayRef<Value *> IdxList,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) - Values,
- Values, InsertBefore),
+ : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, AllocInfo,
+ InsertBefore),
SourceElementType(PointeeType),
ResultElementType(getIndexedType(PointeeType, IdxList)) {
init(Ptr, IdxList, NameStr);
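
For variadic instructions the marker cannot be a class constant, so Create builds an IntrusiveOperandsAllocMarker from the runtime count and passes it twice: to placement new, which sizes the allocation, and to the constructor, which forwards it as AllocInfo so the base class records a matching operand count. A hedged sketch with a hypothetical variadic node (ExampleVariadic is illustrative, not from this patch):

static ExampleVariadic *Create(ArrayRef<Value *> Args) {
  // Size the marker from the runtime operand count; the GEP case above
  // additionally adds 1 for the base pointer.
  IntrusiveOperandsAllocMarker AllocMarker{unsigned(Args.size())};
  return new (AllocMarker) ExampleVariadic(Args, AllocMarker);
}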
@@ -1403,26 +1411,29 @@ public:
/// hold the calling convention of the call.
///
class CallInst : public CallBase {
- CallInst(const CallInst &CI);
+ CallInst(const CallInst &CI, AllocInfo AllocInfo);
/// Construct a CallInst from a range of arguments
inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- InsertPosition InsertBefore);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr, InsertPosition InsertBefore)
- : CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore) {}
+ const Twine &NameStr, AllocInfo AllocInfo,
+ InsertPosition InsertBefore)
+ : CallInst(Ty, Func, Args, std::nullopt, NameStr, AllocInfo,
+ InsertBefore) {}
explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr,
- InsertPosition InsertBefore);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
void init(FunctionType *FTy, Value *Func, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(unsigned NumArgs,
+ unsigned NumBundleInputs = 0) {
// We need one operand for the called function, plus the input operand
// counts provided.
return 1 + NumArgs + NumBundleInputs;
@@ -1437,26 +1448,29 @@ protected:
public:
static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(0)};
+ return new (AllocMarker)
+ CallInst(Ty, F, NameStr, AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- return new (ComputeNumOperands(Args.size()))
- CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(Args.size())};
+ return new (AllocMarker) CallInst(Ty, Func, Args, std::nullopt, NameStr,
+ AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- const int NumOperands =
- ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
- const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(unsigned(Args.size()), CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
- CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
+ return new (AllocMarker)
+ CallInst(Ty, Func, Args, Bundles, NameStr, AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionCallee Func, const Twine &NameStr = "",
@@ -1561,12 +1575,11 @@ private:
CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) -
- (Args.size() + CountBundleInputs(Bundles) + 1),
- unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
+ : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo,
InsertBefore) {
+ assert(AllocInfo.NumOps ==
+ unsigned(Args.size() + CountBundleInputs(Bundles) + 1));
init(Ty, Func, Args, Bundles, NameStr);
}
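
Calls with operand bundles use the third marker kind: IntrusiveOperandsAndDescriptorAllocMarker pairs the operand count with the descriptor size in bytes, replacing the old operator new(Size, NumOps, DescBytes) overload, and the assert added to the CallInst constructor cross-checks the marker against the operands actually initialized. The descriptor flag falls out of the byte count, as the AllocInfo conversion in the User.h hunk later in this patch shows:

// From User.h below: a zero-byte descriptor degrades to a plain
// intrusive-operands allocation.
constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc)
    : NumOps(Alloc.NumOps), HasHungOffUses(false),
      HasDescriptor(Alloc.DescBytes != 0) {}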
@@ -1577,10 +1590,11 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
/// This class represents the LLVM 'select' instruction.
///
class SelectInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(S1->getType(), Instruction::Select, &Op<0>(), 3,
+ : Instruction(S1->getType(), Instruction::Select, AllocMarker,
InsertBefore) {
init(C, S1, S2);
setName(NameStr);
@@ -1604,7 +1618,8 @@ public:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr,
Instruction *MDFrom = nullptr) {
- SelectInst *Sel = new(3) SelectInst(C, S1, S2, NameStr, InsertBefore);
+ SelectInst *Sel =
+ new (AllocMarker) SelectInst(C, S1, S2, NameStr, InsertBefore);
if (MDFrom)
Sel->copyMetadata(*MDFrom);
return Sel;
@@ -1693,6 +1708,8 @@ public:
/// element from a VectorType value
///
class ExtractElementInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
@@ -1706,7 +1723,8 @@ public:
static ExtractElementInst *Create(Value *Vec, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
+ return new (AllocMarker)
+ ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
}
/// Return true if an extractelement instruction can be
@@ -1749,6 +1767,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value)
/// element into a VectorType value
///
class InsertElementInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
InsertElementInst(Value *Vec, Value *NewElt, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
@@ -1763,7 +1783,8 @@ public:
static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
+ return new (AllocMarker)
+ InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
}
/// Return true if an insertelement instruction can be
@@ -1813,6 +1834,8 @@ constexpr int PoisonMaskElem = -1;
/// For scalable vectors, all the elements of the mask must be 0 or -1. This
/// requirement may be relaxed in the future.
class ShuffleVectorInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
SmallVector<int, 4> ShuffleMask;
Constant *ShuffleMaskForBitcode;
@@ -1834,7 +1857,7 @@ public:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { return User::operator delete(Ptr); }
/// Swap the operands and adjust the mask to preserve the semantics
@@ -2395,6 +2418,8 @@ ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
/// value into an aggregate value.
///
class InsertValueInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
SmallVector<unsigned, 4> Indices;
InsertValueInst(const InsertValueInst &IVI);
@@ -2423,7 +2448,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static InsertValueInst *Create(Value *Agg, Value *Val,
@@ -2493,9 +2518,7 @@ struct OperandTraits<InsertValueInst> :
InsertValueInst::InsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(Agg->getType(), InsertValue,
- OperandTraits<InsertValueInst>::op_begin(this), 2,
- InsertBefore) {
+ : Instruction(Agg->getType(), InsertValue, AllocMarker, InsertBefore) {
init(Agg, Val, Idxs, NameStr);
}
@@ -2510,6 +2533,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value)
// scientist's overactive imagination.
//
class PHINode : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2519,7 +2544,7 @@ class PHINode : public Instruction {
explicit PHINode(Type *Ty, unsigned NumReservedValues,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr)
- : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore),
+ : Instruction(Ty, Instruction::PHI, AllocMarker, InsertBefore),
ReservedSpace(NumReservedValues) {
assert(!Ty->isTokenTy() && "PHI nodes cannot have token type!");
setName(NameStr);
@@ -2545,7 +2570,8 @@ public:
static PHINode *Create(Type *Ty, unsigned NumReservedValues,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
+ return new (AllocMarker)
+ PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
}
/// Provide fast operand accessors
@@ -2749,6 +2775,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value)
class LandingPadInst : public Instruction {
using CleanupField = BoolBitfieldElementT<0>;
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2763,7 +2791,7 @@ private:
const Twine &NameStr, InsertPosition InsertBefore);
// Allocate space for exactly zero operands.
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void growOperands(unsigned Size);
void init(unsigned NumReservedValues, const Twine &NameStr);
@@ -2843,7 +2871,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value)
/// does not continue in this function any longer.
///
class ReturnInst : public Instruction {
- ReturnInst(const ReturnInst &RI);
+ ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo);
private:
// ReturnInst constructors:
@@ -2859,8 +2887,8 @@ private:
//
// NOTE: If the Value* passed is of type void then the constructor behaves as
// if it was passed NULL.
- explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr,
- InsertPosition InsertBefore = nullptr);
+ explicit ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo,
+ InsertPosition InsertBefore);
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
@@ -2871,11 +2899,13 @@ protected:
public:
static ReturnInst *Create(LLVMContext &C, Value *retVal = nullptr,
InsertPosition InsertBefore = nullptr) {
- return new(!!retVal) ReturnInst(C, retVal, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{retVal ? 1U : 0U};
+ return new (AllocMarker) ReturnInst(C, retVal, AllocMarker, InsertBefore);
}
static ReturnInst *Create(LLVMContext &C, BasicBlock *InsertAtEnd) {
- return new (0) ReturnInst(C, nullptr, InsertAtEnd);
+ IntrusiveOperandsAllocMarker AllocMarker{0};
+ return new (AllocMarker) ReturnInst(C, nullptr, AllocMarker, InsertAtEnd);
}
/// Provide fast operand accessors
@@ -2923,7 +2953,7 @@ class BranchInst : public Instruction {
/// [Cond, FalseDest,] TrueDest. This makes some accessors faster because
/// they don't have to check for cond/uncond branchness. These are mostly
/// accessed relative from op_end().
- BranchInst(const BranchInst &BI);
+ BranchInst(const BranchInst &BI, AllocInfo AllocInfo);
// BranchInst constructors (where {B, T, F} are blocks, and C is a condition):
// BranchInst(BB *B) - 'br B'
// BranchInst(BB* T, BB *F, Value *C) - 'br C, T, F'
@@ -2933,10 +2963,10 @@ class BranchInst : public Instruction {
// BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I
// BranchInst(BB* B, BB *I) - 'br B' insert at end
// BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end
- explicit BranchInst(BasicBlock *IfTrue,
- InsertPosition InsertBefore = nullptr);
+ explicit BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo,
+ InsertPosition InsertBefore);
BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
- InsertPosition InsertBefore = nullptr);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
void AssertOK();
@@ -2976,13 +3006,16 @@ public:
static BranchInst *Create(BasicBlock *IfTrue,
InsertPosition InsertBefore = nullptr) {
- return new(1) BranchInst(IfTrue, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{1};
+ return new (AllocMarker) BranchInst(IfTrue, AllocMarker, InsertBefore);
}
static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse,
Value *Cond,
InsertPosition InsertBefore = nullptr) {
- return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{3};
+ return new (AllocMarker)
+ BranchInst(IfTrue, IfFalse, Cond, AllocMarker, InsertBefore);
}
/// Transparently provide more efficient getOperand methods.
@@ -3054,6 +3087,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
/// Multiway switch
///
class SwitchInst : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
unsigned ReservedSpace;
// Operand[0] = Value to switch on
@@ -3070,7 +3105,7 @@ class SwitchInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
void growOperands();
@@ -3442,6 +3477,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
/// Indirect Branch Instruction.
///
class IndirectBrInst : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
unsigned ReservedSpace;
// Operand[0] = Address to jump to
@@ -3456,7 +3493,7 @@ class IndirectBrInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *Address, unsigned NumDests);
void growOperands();
@@ -3576,14 +3613,14 @@ class InvokeInst : public CallBase {
/// The index from the end of the operand array to the unwind destination.
static constexpr int UnwindDestOpEndIdx = -2;
- InvokeInst(const InvokeInst &BI);
+ InvokeInst(const InvokeInst &BI, AllocInfo AllocInfo);
/// Construct an InvokeInst given a range of arguments.
///
/// Construct an InvokeInst from a range of arguments
inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
@@ -3591,10 +3628,11 @@ class InvokeInst : public CallBase {
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(unsigned NumArgs,
+ size_t NumBundleInputs = 0) {
// We need one operand for the called function, plus our extra operands and
// the input operand counts provided.
- return 1 + NumExtraOperands + NumArgs + NumBundleInputs;
+ return 1 + NumExtraOperands + NumArgs + unsigned(NumBundleInputs);
}
protected:
@@ -3608,10 +3646,11 @@ public:
BasicBlock *IfException, ArrayRef<Value *> Args,
const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size());
- return new (NumOperands)
+ IntrusiveOperandsAllocMarker AllocMarker{
+ ComputeNumOperands(unsigned(Args.size()))};
+ return new (AllocMarker)
InvokeInst(Ty, Func, IfNormal, IfException, Args, std::nullopt,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
@@ -3619,12 +3658,12 @@ public:
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- int NumOperands =
- ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
- unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
- InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands,
+ return new (AllocMarker)
+ InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, AllocMarker,
NameStr, InsertBefore);
}
@@ -3709,10 +3748,9 @@ private:
InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Invoke,
- OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+ : CallBase(Ty->getReturnType(), Instruction::Invoke, AllocInfo,
InsertBefore) {
init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr);
}
@@ -3729,7 +3767,7 @@ class CallBrInst : public CallBase {
unsigned NumIndirectDests;
- CallBrInst(const CallBrInst &BI);
+ CallBrInst(const CallBrInst &BI, AllocInfo AllocInfo);
/// Construct a CallBrInst given a range of arguments.
///
@@ -3737,7 +3775,7 @@ class CallBrInst : public CallBase {
inline CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
- int NumOperands, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore);
void init(FunctionType *FTy, Value *Func, BasicBlock *DefaultDest,
@@ -3745,11 +3783,11 @@ class CallBrInst : public CallBase {
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumIndirectDests,
- int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(int NumArgs, int NumIndirectDests,
+ int NumBundleInputs = 0) {
// We need one operand for the called function, plus our extra operands and
// the input operand counts provided.
- return 2 + NumIndirectDests + NumArgs + NumBundleInputs;
+ return unsigned(2 + NumIndirectDests + NumArgs + NumBundleInputs);
}
protected:
@@ -3764,10 +3802,11 @@ public:
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args, const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size());
- return new (NumOperands)
+ IntrusiveOperandsAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), IndirectDests.size())};
+ return new (AllocMarker)
CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, std::nullopt,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static CallBrInst *
@@ -3775,13 +3814,14 @@ public:
ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size(),
- CountBundleInputs(Bundles));
- unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), IndirectDests.size(),
+ CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
+ return new (AllocMarker)
CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, Bundles,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest,
@@ -3881,10 +3921,9 @@ private:
CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::CallBr,
- OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+ : CallBase(Ty->getReturnType(), Instruction::CallBr, AllocInfo,
InsertBefore) {
init(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, NameStr);
}
@@ -3897,6 +3936,8 @@ CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
/// Resume the propagation of an exception.
///
class ResumeInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
ResumeInst(const ResumeInst &RI);
explicit ResumeInst(Value *Exn, InsertPosition InsertBefore = nullptr);
@@ -3909,7 +3950,7 @@ protected:
public:
static ResumeInst *Create(Value *Exn, InsertPosition InsertBefore = nullptr) {
- return new(1) ResumeInst(Exn, InsertBefore);
+ return new (AllocMarker) ResumeInst(Exn, InsertBefore);
}
/// Provide fast operand accessors
@@ -3951,6 +3992,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
class CatchSwitchInst : public Instruction {
using UnwindDestField = BoolBitfieldElementT<0>;
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -3969,7 +4012,7 @@ class CatchSwitchInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved);
void growOperands(unsigned Size);
@@ -4114,9 +4157,9 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value)
class CleanupPadInst : public FuncletPadInst {
private:
explicit CleanupPadInst(Value *ParentPad, ArrayRef<Value *> Args,
- unsigned Values, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, Values,
+ : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, AllocInfo,
NameStr, InsertBefore) {}
public:
@@ -4124,9 +4167,9 @@ public:
ArrayRef<Value *> Args = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- unsigned Values = 1 + Args.size();
- return new (Values)
- CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())};
+ return new (AllocMarker)
+ CleanupPadInst(ParentPad, Args, AllocMarker, NameStr, InsertBefore);
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -4144,18 +4187,18 @@ public:
class CatchPadInst : public FuncletPadInst {
private:
explicit CatchPadInst(Value *CatchSwitch, ArrayRef<Value *> Args,
- unsigned Values, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, Values,
+ : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, AllocInfo,
NameStr, InsertBefore) {}
public:
static CatchPadInst *Create(Value *CatchSwitch, ArrayRef<Value *> Args,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- unsigned Values = 1 + Args.size();
- return new (Values)
- CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())};
+ return new (AllocMarker)
+ CatchPadInst(CatchSwitch, Args, AllocMarker, NameStr, InsertBefore);
}
/// Convenience accessors
@@ -4181,6 +4224,8 @@ public:
//===----------------------------------------------------------------------===//
class CatchReturnInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
CatchReturnInst(const CatchReturnInst &RI);
CatchReturnInst(Value *CatchPad, BasicBlock *BB, InsertPosition InsertBefore);
@@ -4197,7 +4242,7 @@ public:
InsertPosition InsertBefore = nullptr) {
assert(CatchPad);
assert(BB);
- return new (2) CatchReturnInst(CatchPad, BB, InsertBefore);
+ return new (AllocMarker) CatchReturnInst(CatchPad, BB, InsertBefore);
}
/// Provide fast operand accessors
@@ -4257,9 +4302,9 @@ class CleanupReturnInst : public Instruction {
using UnwindDestField = BoolBitfieldElementT<0>;
private:
- CleanupReturnInst(const CleanupReturnInst &RI);
- CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
- InsertPosition InsertBefore = nullptr);
+ CleanupReturnInst(const CleanupReturnInst &RI, AllocInfo AllocInfo);
+ CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
+ AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr);
void init(Value *CleanupPad, BasicBlock *UnwindBB);
@@ -4277,8 +4322,9 @@ public:
unsigned Values = 1;
if (UnwindBB)
++Values;
- return new (Values)
- CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{Values};
+ return new (AllocMarker)
+ CleanupReturnInst(CleanupPad, UnwindBB, AllocMarker, InsertBefore);
}
/// Provide fast operand accessors
@@ -4350,6 +4396,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value)
/// end of the block cannot be reached.
///
class UnreachableInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -4361,7 +4409,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
unsigned getNumSuccessors() const { return 0; }
diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h
index 910815f..39e1314 100644
--- a/llvm/include/llvm/IR/User.h
+++ b/llvm/include/llvm/IR/User.h
@@ -43,39 +43,86 @@ struct OperandTraits;
class User : public Value {
friend struct HungoffOperandTraits;
+ template <class ConstantClass> friend struct ConstantAggrKeyType;
LLVM_ATTRIBUTE_ALWAYS_INLINE static void *
allocateFixedOperandUser(size_t, unsigned, unsigned);
protected:
+ // Disable the default operator new, as all subclasses must use one of the
+ // custom operators below depending on how they store their operands.
+ void *operator new(size_t Size) = delete;
+
+ /// Indicates this User has operands "hung off" in another allocation.
+ struct HungOffOperandsAllocMarker {};
+
+ /// Indicates this User has operands co-allocated.
+ struct IntrusiveOperandsAllocMarker {
+ /// The number of operands for this User.
+ const unsigned NumOps;
+ };
+
+ /// Indicates this User has operands and a descriptor co-allocated.
+ struct IntrusiveOperandsAndDescriptorAllocMarker {
+ /// The number of operands for this User.
+ const unsigned NumOps;
+ /// The number of bytes to allocate for the descriptor. Must be divisible by
+ /// `sizeof(void *)`.
+ const unsigned DescBytes;
+ };
+
+ /// Information about how a User object was allocated, to be passed into the
+ /// User constructor.
+ ///
+ /// DO NOT USE DIRECTLY. Use one of the `AllocMarker` structs instead; they
+ /// can all be implicitly converted to `AllocInfo`.
+ struct AllocInfo {
+ public:
+ const unsigned NumOps : NumUserOperandsBits;
+ const bool HasHungOffUses : 1;
+ const bool HasDescriptor : 1;
+
+ AllocInfo() = delete;
+
+ constexpr AllocInfo(const HungOffOperandsAllocMarker)
+ : NumOps(0), HasHungOffUses(true), HasDescriptor(false) {}
+
+ constexpr AllocInfo(const IntrusiveOperandsAllocMarker Alloc)
+ : NumOps(Alloc.NumOps), HasHungOffUses(false), HasDescriptor(false) {}
+
+ constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc)
+ : NumOps(Alloc.NumOps), HasHungOffUses(false),
+ HasDescriptor(Alloc.DescBytes != 0) {}
+ };
+
/// Allocate a User with an operand pointer co-allocated.
///
/// This is used for subclasses which need to allocate a variable number
/// of operands, i.e., 'hung off uses'.
- void *operator new(size_t Size);
+ void *operator new(size_t Size, HungOffOperandsAllocMarker);
/// Allocate a User with the operands co-allocated.
///
/// This is used for subclasses which have a fixed number of operands.
- void *operator new(size_t Size, unsigned Us);
+ void *operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait);
/// Allocate a User with the operands co-allocated. If DescBytes is non-zero
/// then allocate an additional DescBytes bytes before the operands. These
/// bytes can be accessed by calling getDescriptor.
- ///
- /// DescBytes needs to be divisible by sizeof(void *). The allocated
- /// descriptor, if any, is aligned to sizeof(void *) bytes.
- ///
- /// This is used for subclasses which have a fixed number of operands.
- void *operator new(size_t Size, unsigned Us, unsigned DescBytes);
-
- User(Type *ty, unsigned vty, Use *, unsigned NumOps)
- : Value(ty, vty) {
- assert(NumOps < (1u << NumUserOperandsBits) && "Too many operands");
- NumUserOperands = NumOps;
+ void *operator new(size_t Size,
+ IntrusiveOperandsAndDescriptorAllocMarker allocTrait);
+
+ User(Type *ty, unsigned vty, AllocInfo AllocInfo) : Value(ty, vty) {
+ assert(AllocInfo.NumOps < (1u << NumUserOperandsBits) &&
+ "Too many operands");
+ NumUserOperands = AllocInfo.NumOps;
+ assert((!AllocInfo.HasDescriptor || !AllocInfo.HasHungOffUses) &&
+ "Cannot have both hung off uses and a descriptor");
+ HasHungOffUses = AllocInfo.HasHungOffUses;
+ HasDescriptor = AllocInfo.HasDescriptor;
// If we have hung off uses, then the operand list should initially be
// null.
- assert((!HasHungOffUses || !getOperandList()) &&
+ assert((!AllocInfo.HasHungOffUses || !getOperandList()) &&
"Error in initializing hung off uses for User");
}
@@ -98,7 +145,20 @@ public:
/// Free memory allocated for User and Use objects.
void operator delete(void *Usr);
/// Placement delete - required by std, called if the ctor throws.
- void operator delete(void *Usr, unsigned) {
+ void operator delete(void *Usr, HungOffOperandsAllocMarker) {
+ // Note: If a subclass manipulates the information which is required to
+ // calculate the Usr memory pointer, e.g. NumUserOperands, the operator
+ // delete of that subclass has to restore the changed information to the
+ // original value, since the dtor of that class is not called if the ctor
+ // fails.
+ User::operator delete(Usr);
+
+#ifndef LLVM_ENABLE_EXCEPTIONS
+ llvm_unreachable("Constructor throws?");
+#endif
+ }
+ /// Placement delete - required by std, called if the ctor throws.
+ void operator delete(void *Usr, IntrusiveOperandsAllocMarker) {
// Note: If a subclass manipulates the information which is required to calculate the
// Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
// to restore the changed information to the original value, since the dtor of that class
@@ -110,7 +170,7 @@ public:
#endif
}
/// Placement delete - required by std, called if the ctor throws.
- void operator delete(void *Usr, unsigned, unsigned) {
+ void operator delete(void *Usr, IntrusiveOperandsAndDescriptorAllocMarker) {
// Note: If a subclass manipulates the information which is required to calculate the
// Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
// to restore the changed information to the original value, since the dtor of that class
@@ -195,19 +255,6 @@ public:
/// Returns the descriptor co-allocated with this User instance.
MutableArrayRef<uint8_t> getDescriptor();
- /// Set the number of operands on a GlobalVariable.
- ///
- /// GlobalVariable always allocates space for a single operands, but
- /// doesn't always use it.
- ///
- /// FIXME: As that the number of operands is used to find the start of
- /// the allocated memory in operator delete, we need to always think we have
- /// 1 operand before delete.
- void setGlobalVariableNumOperands(unsigned NumOps) {
- assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands");
- NumUserOperands = NumOps;
- }
-
/// Subclasses with hung off uses need to manage the operand count
/// themselves. In these instances, the operand count isn't used to find the
/// OperandList, so there's no issue in having the operand count change.
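
The User.h hunk is the core of the change: the three marker types classify every allocation strategy a User subclass can have, and AllocInfo is the common currency their constructors accept. A compact summary of which marker fits which subclass, based on the uses in this patch:

// HungOffOperandsAllocMarker{}        - operands in a separate, growable
//                                       allocation (PHINode, SwitchInst).
// IntrusiveOperandsAllocMarker{N}     - exactly N operands co-allocated
//                                       with the object (StoreInst uses 2).
// IntrusiveOperandsAndDescriptorAllocMarker{N, Bytes}
//                                     - N operands plus a Bytes-sized
//                                       descriptor (CallBase with bundles).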
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 2fdbbbd..8888468 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -1506,6 +1506,10 @@ public:
static Value *create(Value *Cond, Value *True, Value *False,
BasicBlock *InsertAtEnd, Context &Ctx,
const Twine &Name = "");
+
+ const Value *getCondition() const { return getOperand(0); }
+ const Value *getTrueValue() const { return getOperand(1); }
+ const Value *getFalseValue() const { return getOperand(2); }
Value *getCondition() { return getOperand(0); }
Value *getTrueValue() { return getOperand(1); }
Value *getFalseValue() { return getOperand(2); }
@@ -1513,7 +1517,16 @@ public:
void setCondition(Value *New) { setOperand(0, New); }
void setTrueValue(Value *New) { setOperand(1, New); }
void setFalseValue(Value *New) { setOperand(2, New); }
- void swapValues() { cast<llvm::SelectInst>(Val)->swapValues(); }
+ void swapValues();
+
+ /// Return a string if the specified operands are invalid for a select
+ /// operation, otherwise return null.
+ static const char *areInvalidOperands(Value *Cond, Value *True,
+ Value *False) {
+ return llvm::SelectInst::areInvalidOperands(Cond->Val, True->Val,
+ False->Val);
+ }
+
/// For isa/dyn_cast.
static bool classof(const Value *From);
};
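
The SandboxIR additions mirror the llvm::SelectInst API: areInvalidOperands returns an error string for invalid operand combinations and null otherwise, so callers can validate before constructing. A hedged usage sketch (Cond, TrueV, FalseV, InsertAtEnd, and Ctx are assumed to be in scope):

if (const char *Err = SelectInst::areInvalidOperands(Cond, TrueV, FalseV))
  report_fatal_error(Twine("cannot create select: ") + Err);
Value *Sel = SelectInst::create(Cond, TrueV, FalseV, InsertAtEnd, Ctx, "sel");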
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 14a32cc..66fdcb4 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -755,7 +755,7 @@ template <class X, class Y> auto dyn_cast_if_present(Y *Val) {
// Forwards to dyn_cast_if_present to avoid breaking current users. This is
// deprecated and will be removed in a future patch, use
-// cast_if_present instead.
+// dyn_cast_if_present instead.
template <class X, class Y> auto dyn_cast_or_null(const Y &Val) {
return dyn_cast_if_present<X>(Val);
}
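
The Casting.h hunk only corrects the comment: dyn_cast_or_null forwards to dyn_cast_if_present, so the deprecation note should name the latter. A hedged usage sketch (MaybeNull is an assumed, possibly-null Value pointer):

// Both spellings accept null and return null on a failed cast; new code
// should prefer dyn_cast_if_present.
if (auto *F = dyn_cast_if_present<Function>(MaybeNull))
  F->print(llvm::errs());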
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index e32a54f..6d035d5 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1267,9 +1267,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
}
ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT,
- ArrayRef<Constant *> V)
- : Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(),
- V.size()) {
+ ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : Constant(T, VT, AllocInfo) {
llvm::copy(V, op_begin());
// Check that types match, unless this is an opaque struct.
@@ -1282,8 +1282,9 @@ ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT,
}
}
-ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantArrayVal, V) {
+ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantArrayVal, V, AllocInfo) {
assert(V.size() == T->getNumElements() &&
"Invalid initializer for constant array");
}
@@ -1346,8 +1347,9 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V,
return getTypeForElements(V[0]->getContext(), V, Packed);
}
-ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantStructVal, V) {
+ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantStructVal, V, AllocInfo) {
assert((T->isOpaque() || V.size() == T->getNumElements()) &&
"Invalid initializer for constant struct");
}
@@ -1388,8 +1390,9 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantVectorVal, V) {
+ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantVectorVal, V, AllocInfo) {
assert(V.size() == cast<FixedVectorType>(T)->getNumElements() &&
"Invalid initializer for constant vector");
}
@@ -1879,7 +1882,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) {
BlockAddress::BlockAddress(Function *F, BasicBlock *BB)
: Constant(PointerType::get(F->getContext(), F->getAddressSpace()),
- Value::BlockAddressVal, &Op<0>(), 2) {
+ Value::BlockAddressVal, AllocMarker) {
setOperand(0, F);
setOperand(1, BB);
BB->AdjustBlockAddressRefCount(1);
@@ -1951,7 +1954,7 @@ DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) {
}
DSOLocalEquivalent::DSOLocalEquivalent(GlobalValue *GV)
- : Constant(GV->getType(), Value::DSOLocalEquivalentVal, &Op<0>(), 1) {
+ : Constant(GV->getType(), Value::DSOLocalEquivalentVal, AllocMarker) {
setOperand(0, GV);
}
@@ -2009,7 +2012,7 @@ NoCFIValue *NoCFIValue::get(GlobalValue *GV) {
}
NoCFIValue::NoCFIValue(GlobalValue *GV)
- : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) {
+ : Constant(GV->getType(), Value::NoCFIValueVal, AllocMarker) {
setOperand(0, GV);
}
@@ -2056,7 +2059,7 @@ ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const {
ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key,
ConstantInt *Disc, Constant *AddrDisc)
- : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) {
+ : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, AllocMarker) {
assert(Ptr->getType()->isPointerTy());
assert(Key->getBitWidth() == 32);
assert(Disc->getBitWidth() == 64);
@@ -2758,11 +2761,8 @@ const char *ConstantExpr::getOpcodeName() const {
GetElementPtrConstantExpr::GetElementPtrConstantExpr(
Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy,
- std::optional<ConstantRange> InRange)
- : ConstantExpr(DestTy, Instruction::GetElementPtr,
- OperandTraits<GetElementPtrConstantExpr>::op_end(this) -
- (IdxList.size() + 1),
- IdxList.size() + 1),
+ std::optional<ConstantRange> InRange, AllocInfo AllocInfo)
+ : ConstantExpr(DestTy, Instruction::GetElementPtr, AllocInfo),
SrcElementTy(SrcElementTy),
ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)),
InRange(std::move(InRange)) {
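The same mechanical substitution repeats across the aggregate constant constructors in this file; in miniature, with Foo standing in for any of them (shapes taken from the hunks above):

    // Before: the constructor derived its Use array from the operand count.
    //   Foo::Foo(Type *Ty, ArrayRef<Constant *> V)
    //       : Constant(Ty, FooVal,
    //                  OperandTraits<Foo>::op_end(this) - V.size(), V.size()) {}
    // After: the allocating caller builds a marker and threads it through.
    //   Foo::Foo(Type *Ty, ArrayRef<Constant *> V, AllocInfo AllocInfo)
    //       : Constant(Ty, FooVal, AllocInfo) {}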
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index bd19ec6..6afc86f 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -44,14 +44,16 @@ namespace llvm {
/// CastConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement cast constant exprs.
class CastConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
public:
CastConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
- : ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
+ : ConstantExpr(Ty, Opcode, AllocMarker) {
Op<0>() = C;
}
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -67,17 +69,19 @@ public:
/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags)
- : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) {
+ : ConstantExpr(C1->getType(), Opcode, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
SubclassOptionalData = Flags;
}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -95,16 +99,18 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractelement constant exprs.
class ExtractElementConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
ExtractElementConstantExpr(Constant *C1, Constant *C2)
- : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
- Instruction::ExtractElement, &Op<0>(), 2) {
+ : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
+ Instruction::ExtractElement, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -122,17 +128,18 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertelement constant exprs.
class InsertElementConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
public:
InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
- : ConstantExpr(C1->getType(), Instruction::InsertElement,
- &Op<0>(), 3) {
+ : ConstantExpr(C1->getType(), Instruction::InsertElement, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
Op<2>() = C3;
}
// allocate space for exactly three operands
- void *operator new(size_t S) { return User::operator new(S, 3); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -150,12 +157,14 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
ShuffleVectorConstantExpr(Constant *C1, Constant *C2, ArrayRef<int> Mask)
: ConstantExpr(VectorType::get(
cast<VectorType>(C1->getType())->getElementType(),
Mask.size(), isa<ScalableVectorType>(C1->getType())),
- Instruction::ShuffleVector, &Op<0>(), 2) {
+ Instruction::ShuffleVector, AllocMarker) {
assert(ShuffleVectorInst::isValidOperands(C1, C2, Mask) &&
"Invalid shuffle vector instruction operands!");
Op<0>() = C1;
@@ -168,7 +177,7 @@ public:
SmallVector<int, 4> ShuffleMask;
Constant *ShuffleMaskForBitcode;
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { return User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -191,15 +200,17 @@ class GetElementPtrConstantExpr : public ConstantExpr {
GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList, Type *DestTy,
- std::optional<ConstantRange> InRange);
+ std::optional<ConstantRange> InRange,
+ AllocInfo AllocInfo);
public:
static GetElementPtrConstantExpr *
Create(Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList,
Type *DestTy, unsigned Flags, std::optional<ConstantRange> InRange) {
- GetElementPtrConstantExpr *Result = new (IdxList.size() + 1)
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(IdxList.size() + 1)};
+ GetElementPtrConstantExpr *Result = new (AllocMarker)
GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy,
- std::move(InRange));
+ std::move(InRange), AllocMarker);
Result->SubclassOptionalData = Flags;
return Result;
}
@@ -318,7 +329,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
ConstantClass *create(TypeClass *Ty) const {
- return new (Operands.size()) ConstantClass(Ty, Operands);
+ User::IntrusiveOperandsAllocMarker AllocMarker{unsigned(Operands.size())};
+ return new (AllocMarker) ConstantClass(Ty, Operands, AllocMarker);
}
};
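Each fixed-arity expression class above follows the same recipe; a condensed sketch with an invented class name (base-class signatures assumed from this patch):

    class TernaryConstantExprExample final : public ConstantExpr {
      // Arity is a compile-time constant, so one marker serves both the
      // placement-new and the base-class constructor.
      constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};

    public:
      TernaryConstantExprExample(Type *Ty, unsigned Opcode)
          : ConstantExpr(Ty, Opcode, AllocMarker) {}
      void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
      void operator delete(void *Ptr) { User::operator delete(Ptr); }
    };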
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index afef893..82ff4e1 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -402,7 +402,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
LinkageTypes Linkage,
unsigned AddrSpace, const Twine &N,
Module *M) {
- auto *F = new Function(Ty, Linkage, AddrSpace, N, M);
+ auto *F = new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
AttrBuilder B(F->getContext());
UWTableKind UWTable = M->getUwtable();
if (UWTable != UWTableKind::None)
@@ -501,8 +501,7 @@ static unsigned computeAddrSpace(unsigned AddrSpace, Module *M) {
Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
const Twine &name, Module *ParentModule)
- : GlobalObject(Ty, Value::FunctionVal,
- OperandTraits<Function>::op_begin(this), 0, Linkage, name,
+ : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name,
computeAddrSpace(AddrSpace, ParentModule)),
NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 2bc69cd..99f4fa5 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -442,9 +442,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
Constant *InitVal, const Twine &Name,
ThreadLocalMode TLMode, unsigned AddressSpace,
bool isExternallyInitialized)
- : GlobalObject(Ty, Value::GlobalVariableVal,
- OperandTraits<GlobalVariable>::op_begin(this),
- InitVal != nullptr, Link, Name, AddressSpace),
+ : GlobalObject(Ty, Value::GlobalVariableVal, AllocMarker, Link, Name,
+ AddressSpace),
isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) &&
@@ -454,6 +453,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
Op<0>() = InitVal;
+ } else {
+ setGlobalVariableNumOperands(0);
}
}
@@ -540,7 +541,7 @@ void GlobalVariable::setCodeModel(CodeModel::Model CM) {
GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Aliasee,
Module *ParentModule)
- : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name,
+ : GlobalValue(Ty, Value::GlobalAliasVal, AllocMarker, Link, Name,
AddressSpace) {
setAliasee(Aliasee);
if (ParentModule)
@@ -597,7 +598,7 @@ const GlobalObject *GlobalAlias::getAliaseeObject() const {
GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Resolver,
Module *ParentModule)
- : GlobalObject(Ty, Value::GlobalIFuncVal, &Op<0>(), 1, Link, Name,
+ : GlobalObject(Ty, Value::GlobalIFuncVal, AllocMarker, Link, Name,
AddressSpace) {
setResolver(Resolver);
if (ParentModule)
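GlobalVariable is the one class here whose operand count is dynamic (0 or 1) under a fixed-size allocation, hence the new else branch; in outline (per the hunk above):

    // Space for one operand is always reserved via AllocMarker, but the
    // in-use count must match reality:
    //   if (InitVal) Op<0>() = InitVal;               // one operand in use
    //   else         setGlobalVariableNumOperands(0); // none in use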
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 62d88ce..b1c2b02 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -32,9 +32,9 @@ InsertPosition::InsertPosition(Instruction *InsertBefore)
InsertPosition::InsertPosition(BasicBlock *InsertAtEnd)
: InsertAt(InsertAtEnd ? InsertAtEnd->end() : InstListType::iterator()) {}
-Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+Instruction::Instruction(Type *ty, unsigned it, AllocInfo AllocInfo,
InsertPosition InsertBefore)
- : User(ty, Value::InstructionVal + it, Ops, NumOps) {
+ : User(ty, Value::InstructionVal + it, AllocInfo) {
// When called with an iterator, there must be a block to insert into.
if (InstListType::iterator InsertIt = InsertBefore; InsertIt.isValid()) {
BasicBlock *BB = InsertIt.getNodeParent();
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 19da1f6..e95b98a 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -121,8 +121,9 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
//===----------------------------------------------------------------------===//
PHINode::PHINode(const PHINode &PN)
- : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()),
+ : Instruction(PN.getType(), Instruction::PHI, AllocMarker),
ReservedSpace(PN.getNumOperands()) {
+ NumUserOperands = PN.getNumOperands();
allocHungoffUses(PN.getNumOperands());
std::copy(PN.op_begin(), PN.op_end(), op_begin());
copyIncomingBlocks(make_range(PN.block_begin(), PN.block_end()));
@@ -243,14 +244,14 @@ bool PHINode::hasConstantOrUndefValue() const {
LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) {
+ : Instruction(RetTy, Instruction::LandingPad, AllocMarker, InsertBefore) {
init(NumReservedValues, NameStr);
}
LandingPadInst::LandingPadInst(const LandingPadInst &LP)
- : Instruction(LP.getType(), Instruction::LandingPad, nullptr,
- LP.getNumOperands()),
+ : Instruction(LP.getType(), Instruction::LandingPad, AllocMarker),
ReservedSpace(LP.getNumOperands()) {
+ NumUserOperands = LP.getNumOperands();
allocHungoffUses(LP.getNumOperands());
Use *OL = getOperandList();
const Use *InOL = LP.getOperandList();
@@ -716,16 +717,16 @@ void CallInst::init(FunctionType *FTy, Value *Func, const Twine &NameStr) {
}
CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name,
- InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) - 1, 1, InsertBefore) {
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
+ : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo,
+ InsertBefore) {
init(Ty, Func, Name);
}
-CallInst::CallInst(const CallInst &CI)
- : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) - CI.getNumOperands(),
- CI.getNumOperands()) {
+CallInst::CallInst(const CallInst &CI, AllocInfo AllocInfo)
+ : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, AllocInfo) {
+ assert(getNumOperands() == CI.getNumOperands() &&
+ "Wrong number of operands allocated");
setTailCallKind(CI.getTailCallKind());
setCallingConv(CI.getCallingConv());
@@ -774,7 +775,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
const Twine &NameStr) {
this->FTy = FTy;
- assert((int)getNumOperands() ==
+ assert(getNumOperands() ==
ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) &&
"NumOperands not set up?");
@@ -803,10 +804,10 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
setName(NameStr);
}
-InvokeInst::InvokeInst(const InvokeInst &II)
- : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke,
- OperandTraits<CallBase>::op_end(this) - II.getNumOperands(),
- II.getNumOperands()) {
+InvokeInst::InvokeInst(const InvokeInst &II, AllocInfo AllocInfo)
+ : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, AllocInfo) {
+ assert(getNumOperands() == II.getNumOperands() &&
+ "Wrong number of operands allocated");
setCallingConv(II.getCallingConv());
std::copy(II.op_begin(), II.op_end(), op_begin());
std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -855,9 +856,9 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough,
const Twine &NameStr) {
this->FTy = FTy;
- assert((int)getNumOperands() ==
- ComputeNumOperands(Args.size(), IndirectDests.size(),
- CountBundleInputs(Bundles)) &&
+ assert(getNumOperands() == ComputeNumOperands(Args.size(),
+ IndirectDests.size(),
+ CountBundleInputs(Bundles)) &&
"NumOperands not set up?");
#ifndef NDEBUG
@@ -887,10 +888,11 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough,
setName(NameStr);
}
-CallBrInst::CallBrInst(const CallBrInst &CBI)
+CallBrInst::CallBrInst(const CallBrInst &CBI, AllocInfo AllocInfo)
: CallBase(CBI.Attrs, CBI.FTy, CBI.getType(), Instruction::CallBr,
- OperandTraits<CallBase>::op_end(this) - CBI.getNumOperands(),
- CBI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == CBI.getNumOperands() &&
+ "Wrong number of operands allocated");
setCallingConv(CBI.getCallingConv());
std::copy(CBI.op_begin(), CBI.op_end(), op_begin());
std::copy(CBI.bundle_op_info_begin(), CBI.bundle_op_info_end(),
@@ -918,19 +920,19 @@ CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB,
// ReturnInst Implementation
//===----------------------------------------------------------------------===//
-ReturnInst::ReturnInst(const ReturnInst &RI)
+ReturnInst::ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo)
: Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret,
- OperandTraits<ReturnInst>::op_end(this) - RI.getNumOperands(),
- RI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == RI.getNumOperands() &&
+ "Wrong number of operands allocated");
if (RI.getNumOperands())
Op<0>() = RI.Op<0>();
SubclassOptionalData = RI.SubclassOptionalData;
}
-ReturnInst::ReturnInst(LLVMContext &C, Value *retVal,
+ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(C), Instruction::Ret,
- OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+ : Instruction(Type::getVoidTy(C), Instruction::Ret, AllocInfo,
InsertBefore) {
if (retVal)
Op<0>() = retVal;
@@ -942,13 +944,13 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal,
ResumeInst::ResumeInst(const ResumeInst &RI)
: Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume,
- OperandTraits<ResumeInst>::op_begin(this), 1) {
+ AllocMarker) {
Op<0>() = RI.Op<0>();
}
ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
- OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+ AllocMarker, InsertBefore) {
Op<0>() = Exn;
}
@@ -956,11 +958,11 @@ ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore)
// CleanupReturnInst Implementation
//===----------------------------------------------------------------------===//
-CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
- : Instruction(CRI.getType(), Instruction::CleanupRet,
- OperandTraits<CleanupReturnInst>::op_end(this) -
- CRI.getNumOperands(),
- CRI.getNumOperands()) {
+CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI,
+ AllocInfo AllocInfo)
+ : Instruction(CRI.getType(), Instruction::CleanupRet, AllocInfo) {
+ assert(getNumOperands() == CRI.getNumOperands() &&
+ "Wrong number of operands allocated");
setSubclassData<Instruction::OpaqueField>(
CRI.getSubclassData<Instruction::OpaqueField>());
Op<0>() = CRI.Op<0>();
@@ -978,12 +980,10 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
}
CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
- unsigned Values,
+ AllocInfo AllocInfo,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(CleanupPad->getContext()),
- Instruction::CleanupRet,
- OperandTraits<CleanupReturnInst>::op_end(this) - Values,
- Values, InsertBefore) {
+ Instruction::CleanupRet, AllocInfo, InsertBefore) {
init(CleanupPad, UnwindBB);
}
@@ -997,7 +997,7 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) {
CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
: Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
- OperandTraits<CatchReturnInst>::op_begin(this), 2) {
+ AllocMarker) {
Op<0>() = CRI.Op<0>();
Op<1>() = CRI.Op<1>();
}
@@ -1005,8 +1005,7 @@ CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
- OperandTraits<CatchReturnInst>::op_begin(this), 2,
- InsertBefore) {
+ AllocMarker, InsertBefore) {
init(CatchPad, BB);
}
@@ -1018,7 +1017,7 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
unsigned NumReservedValues,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+ : Instruction(ParentPad->getType(), Instruction::CatchSwitch, AllocMarker,
InsertBefore) {
if (UnwindDest)
++NumReservedValues;
@@ -1027,8 +1026,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
}
CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI)
- : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr,
- CSI.getNumOperands()) {
+ : Instruction(CSI.getType(), Instruction::CatchSwitch, AllocMarker) {
+ NumUserOperands = CSI.NumUserOperands;
init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands());
setNumHungOffUseOperands(ReservedSpace);
Use *OL = getOperandList();
@@ -1093,22 +1092,19 @@ void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args,
setName(NameStr);
}
-FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI)
- : Instruction(FPI.getType(), FPI.getOpcode(),
- OperandTraits<FuncletPadInst>::op_end(this) -
- FPI.getNumOperands(),
- FPI.getNumOperands()) {
+FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI, AllocInfo AllocInfo)
+ : Instruction(FPI.getType(), FPI.getOpcode(), AllocInfo) {
+ assert(getNumOperands() == FPI.getNumOperands() &&
+ "Wrong number of operands allocated");
std::copy(FPI.op_begin(), FPI.op_end(), op_begin());
setParentPad(FPI.getParentPad());
}
FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
- ArrayRef<Value *> Args, unsigned Values,
+ ArrayRef<Value *> Args, AllocInfo AllocInfo,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(ParentPad->getType(), Op,
- OperandTraits<FuncletPadInst>::op_end(this) - Values, Values,
- InsertBefore) {
+ : Instruction(ParentPad->getType(), Op, AllocInfo, InsertBefore) {
init(ParentPad, Args, NameStr);
}
@@ -1118,8 +1114,8 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
UnreachableInst::UnreachableInst(LLVMContext &Context,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
- 0, InsertBefore) {}
+ : Instruction(Type::getVoidTy(Context), Instruction::Unreachable,
+ AllocMarker, InsertBefore) {}
//===----------------------------------------------------------------------===//
// BranchInst Implementation
@@ -1131,19 +1127,18 @@ void BranchInst::AssertOK() {
"May only branch on boolean predicates!");
}
-BranchInst::BranchInst(BasicBlock *IfTrue, InsertPosition InsertBefore)
+BranchInst::BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo,
+ InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - 1, 1,
- InsertBefore) {
+ AllocInfo, InsertBefore) {
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
- InsertPosition InsertBefore)
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - 3, 3,
- InsertBefore) {
+ AllocInfo, InsertBefore) {
// Assign in order of operand index to make use-list order predictable.
Op<-3>() = Cond;
Op<-2>() = IfFalse;
@@ -1153,10 +1148,11 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-BranchInst::BranchInst(const BranchInst &BI)
+BranchInst::BranchInst(const BranchInst &BI, AllocInfo AllocInfo)
: Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
- BI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == BI.getNumOperands() &&
+ "Wrong number of operands allocated");
// Assign in order of operand index to make use-list order predictable.
if (BI.getNumOperands() != 1) {
assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
@@ -1313,9 +1309,8 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
AtomicOrdering Order, SyncScope::ID SSID,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(val->getContext()), Store,
- OperandTraits<StoreInst>::op_begin(this),
- OperandTraits<StoreInst>::operands(this), InsertBefore) {
+ : Instruction(Type::getVoidTy(val->getContext()), Store, AllocMarker,
+ InsertBefore) {
Op<0>() = val;
Op<1>() = addr;
setVolatile(isVolatile);
@@ -1356,8 +1351,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
InsertPosition InsertBefore)
: Instruction(
StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
- AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
- OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
+ AtomicCmpXchg, AllocMarker, InsertBefore) {
Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID);
}
@@ -1389,9 +1383,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
SyncScope::ID SSID, InsertPosition InsertBefore)
- : Instruction(Val->getType(), AtomicRMW,
- OperandTraits<AtomicRMWInst>::op_begin(this),
- OperandTraits<AtomicRMWInst>::operands(this), InsertBefore) {
+ : Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) {
Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
}
@@ -1448,7 +1440,7 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) {
FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
SyncScope::ID SSID, InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
+ : Instruction(Type::getVoidTy(C), Fence, AllocMarker, InsertBefore) {
setOrdering(Ordering);
setSyncScopeID(SSID);
}
@@ -1466,13 +1458,13 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
setName(Name);
}
-GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
- : Instruction(GEPI.getType(), GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) -
- GEPI.getNumOperands(),
- GEPI.getNumOperands()),
+GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI,
+ AllocInfo AllocInfo)
+ : Instruction(GEPI.getType(), GetElementPtr, AllocInfo),
SourceElementType(GEPI.SourceElementType),
ResultElementType(GEPI.ResultElementType) {
+ assert(getNumOperands() == GEPI.getNumOperands() &&
+ "Wrong number of operands allocated");
std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin());
SubclassOptionalData = GEPI.SubclassOptionalData;
}
@@ -1606,9 +1598,8 @@ bool GetElementPtrInst::collectOffset(
ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
const Twine &Name,
InsertPosition InsertBef)
- : Instruction(
- cast<VectorType>(Val->getType())->getElementType(), ExtractElement,
- OperandTraits<ExtractElementInst>::op_begin(this), 2, InsertBef) {
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement, AllocMarker, InsertBef) {
assert(isValidOperands(Val, Index) &&
"Invalid extractelement instruction operands!");
Op<0>() = Val;
@@ -1629,9 +1620,7 @@ bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
const Twine &Name,
InsertPosition InsertBef)
- : Instruction(Vec->getType(), InsertElement,
- OperandTraits<InsertElementInst>::op_begin(this), 3,
- InsertBef) {
+ : Instruction(Vec->getType(), InsertElement, AllocMarker, InsertBef) {
assert(isValidOperands(Vec, Elt, Index) &&
"Invalid insertelement instruction operands!");
Op<0>() = Vec;
@@ -1679,8 +1668,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
: Instruction(
VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
cast<VectorType>(Mask->getType())->getElementCount()),
- ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+ ShuffleVector, AllocMarker, InsertBefore) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
@@ -1698,8 +1686,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef<int> Mask,
: Instruction(
VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
Mask.size(), isa<ScalableVectorType>(V1->getType())),
- ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+ ShuffleVector, AllocMarker, InsertBefore) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
Op<0>() = V1;
@@ -2464,9 +2451,8 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
}
InsertValueInst::InsertValueInst(const InsertValueInst &IVI)
- : Instruction(IVI.getType(), InsertValue,
- OperandTraits<InsertValueInst>::op_begin(this), 2),
- Indices(IVI.Indices) {
+ : Instruction(IVI.getType(), InsertValue, AllocMarker),
+ Indices(IVI.Indices) {
Op<0>() = IVI.getOperand(0);
Op<1>() = IVI.getOperand(1);
SubclassOptionalData = IVI.SubclassOptionalData;
@@ -2565,8 +2551,7 @@ void UnaryOperator::AssertOK() {
BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, InsertPosition InsertBefore)
- : Instruction(Ty, iType, OperandTraits<BinaryOperator>::op_begin(this),
- OperandTraits<BinaryOperator>::operands(this), InsertBefore) {
+ : Instruction(Ty, iType, AllocMarker, InsertBefore) {
Op<0>() = S1;
Op<1>() = S2;
setName(Name);
@@ -3427,8 +3412,7 @@ AddrSpaceCastInst::AddrSpaceCastInst(Value *S, Type *Ty, const Twine &Name,
CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
Value *RHS, const Twine &Name, InsertPosition InsertBefore,
Instruction *FlagsSource)
- : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this),
- OperandTraits<CmpInst>::operands(this), InsertBefore) {
+ : Instruction(ty, op, AllocMarker, InsertBefore) {
Op<0>() = LHS;
Op<1>() = RHS;
setPredicate((Predicate)predicate);
@@ -3918,12 +3902,12 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
- nullptr, 0, InsertBefore) {
+ AllocMarker, InsertBefore) {
init(Value, Default, 2+NumCases*2);
}
SwitchInst::SwitchInst(const SwitchInst &SI)
- : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) {
+ : Instruction(SI.getType(), Instruction::Switch, AllocMarker) {
init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
setNumHungOffUseOperands(SI.getNumOperands());
Use *OL = getOperandList();
@@ -4125,13 +4109,14 @@ void IndirectBrInst::growOperands() {
IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Address->getContext()),
- Instruction::IndirectBr, nullptr, 0, InsertBefore) {
+ Instruction::IndirectBr, AllocMarker, InsertBefore) {
init(Address, NumCases);
}
IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
: Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
- nullptr, IBI.getNumOperands()) {
+ AllocMarker) {
+ NumUserOperands = IBI.NumUserOperands;
allocHungoffUses(IBI.getNumOperands());
Use *OL = getOperandList();
const Use *InOL = IBI.getOperandList();
@@ -4185,7 +4170,8 @@ FreezeInst::FreezeInst(Value *S, const Twine &Name, InsertPosition InsertBefore)
// unit that uses these classes.
GetElementPtrInst *GetElementPtrInst::cloneImpl() const {
- return new (getNumOperands()) GetElementPtrInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) GetElementPtrInst(*this, AllocMarker);
}
UnaryOperator *UnaryOperator::cloneImpl() const {
@@ -4305,10 +4291,13 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const {
CallInst *CallInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new(getNumOperands(), DescriptorBytes) CallInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) CallInst(*this, AllocMarker);
}
- return new(getNumOperands()) CallInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CallInst(*this, AllocMarker);
}
SelectInst *SelectInst::cloneImpl() const {
@@ -4331,18 +4320,20 @@ ShuffleVectorInst *ShuffleVectorInst::cloneImpl() const {
return new ShuffleVectorInst(getOperand(0), getOperand(1), getShuffleMask());
}
-PHINode *PHINode::cloneImpl() const { return new PHINode(*this); }
+PHINode *PHINode::cloneImpl() const { return new (AllocMarker) PHINode(*this); }
LandingPadInst *LandingPadInst::cloneImpl() const {
return new LandingPadInst(*this);
}
ReturnInst *ReturnInst::cloneImpl() const {
- return new(getNumOperands()) ReturnInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) ReturnInst(*this, AllocMarker);
}
BranchInst *BranchInst::cloneImpl() const {
- return new(getNumOperands()) BranchInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) BranchInst(*this, AllocMarker);
}
SwitchInst *SwitchInst::cloneImpl() const { return new SwitchInst(*this); }
@@ -4353,28 +4344,37 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const {
InvokeInst *InvokeInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new(getNumOperands(), DescriptorBytes) InvokeInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) InvokeInst(*this, AllocMarker);
}
- return new(getNumOperands()) InvokeInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) InvokeInst(*this, AllocMarker);
}
CallBrInst *CallBrInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new (getNumOperands(), DescriptorBytes) CallBrInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) CallBrInst(*this, AllocMarker);
}
- return new (getNumOperands()) CallBrInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CallBrInst(*this, AllocMarker);
}
-ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); }
+ResumeInst *ResumeInst::cloneImpl() const {
+ return new (AllocMarker) ResumeInst(*this);
+}
CleanupReturnInst *CleanupReturnInst::cloneImpl() const {
- return new (getNumOperands()) CleanupReturnInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CleanupReturnInst(*this, AllocMarker);
}
CatchReturnInst *CatchReturnInst::cloneImpl() const {
- return new (getNumOperands()) CatchReturnInst(*this);
+ return new (AllocMarker) CatchReturnInst(*this);
}
CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
@@ -4382,7 +4382,8 @@ CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
}
FuncletPadInst *FuncletPadInst::cloneImpl() const {
- return new (getNumOperands()) FuncletPadInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) FuncletPadInst(*this, AllocMarker);
}
UnreachableInst *UnreachableInst::cloneImpl() const {
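All of the cloneImpl() rewrites above share one shape: build a marker from the runtime operand count, then hand it to both the placement-new and the copy constructor, which asserts the counts agree. Generic sketch (Inst is a placeholder):

    Inst *Inst::cloneImpl() const {
      IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
      // The (Inst &, AllocInfo) constructor asserts getNumOperands() matches.
      return new (AllocMarker) Inst(*this, AllocMarker);
    }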
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index 00dd9c7..b0aa785 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -145,10 +145,7 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate));
Use *Start = reinterpret_cast<Use *>(Storage + DescBytesToAllocate);
Use *End = Start + Us;
- User *Obj = reinterpret_cast<User*>(End);
- Obj->NumUserOperands = Us;
- Obj->HasHungOffUses = false;
- Obj->HasDescriptor = DescBytes != 0;
+ User *Obj = reinterpret_cast<User *>(End);
for (; Start != End; Start++)
new (Start) Use(Obj);
@@ -160,22 +157,21 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
return Obj;
}
-void *User::operator new(size_t Size, unsigned Us) {
- return allocateFixedOperandUser(Size, Us, 0);
+void *User::operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait) {
+ return allocateFixedOperandUser(Size, allocTrait.NumOps, 0);
}
-void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) {
- return allocateFixedOperandUser(Size, Us, DescBytes);
+void *User::operator new(size_t Size,
+ IntrusiveOperandsAndDescriptorAllocMarker allocTrait) {
+ return allocateFixedOperandUser(Size, allocTrait.NumOps,
+ allocTrait.DescBytes);
}
-void *User::operator new(size_t Size) {
+void *User::operator new(size_t Size, HungOffOperandsAllocMarker) {
// Allocate space for a single Use*
void *Storage = ::operator new(Size + sizeof(Use *));
Use **HungOffOperandList = static_cast<Use **>(Storage);
User *Obj = reinterpret_cast<User *>(HungOffOperandList + 1);
- Obj->NumUserOperands = 0;
- Obj->HasHungOffUses = true;
- Obj->HasDescriptor = false;
*HungOffOperandList = nullptr;
return Obj;
}
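The three operator-new overloads now map one marker type to one memory layout; as inferred from this hunk (the flag setup deleted here presumably moves into the User constructor via AllocInfo, which this hunk does not show):

    // IntrusiveOperandsAllocMarker{N}                 -> N Uses co-allocated
    //                                                    before the User
    // IntrusiveOperandsAndDescriptorAllocMarker{N, B} -> N Uses plus B
    //                                                    descriptor bytes
    // HungOffOperandsAllocMarker{}                    -> one Use* slot;
    //                                                    operands hung off later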
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ebf262..8f151a9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1181,8 +1181,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// Enable contextual profiling instrumentation.
const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink &&
PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled();
- const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt &&
- Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
+ const bool IsCtxProfUse =
+ !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen ||
IsCtxProfUse)
@@ -1673,7 +1673,7 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
// In pre-link, for ctx prof use, we stop here with an instrumented IR. We let
// thinlto use the contextual info to perform imports; then use the contextual
// profile in the post-thinlink phase.
- if (!UseCtxProfile.empty() && !PGOOpt) {
+ if (!UseCtxProfile.empty()) {
addRequiredLTOPreLinkPasses(MPM);
return MPM;
}
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 18fdcda..df38395 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -662,6 +662,12 @@ Value *SelectInst::create(Value *Cond, Value *True, Value *False,
return createCommon(Cond, True, False, Name, Builder, Ctx);
}
+void SelectInst::swapValues() {
+ Ctx.getTracker().emplaceIfTracking<UseSwap>(getOperandUse(1),
+ getOperandUse(2));
+ cast<llvm::SelectInst>(Val)->swapValues();
+}
+
bool SelectInst::classof(const Value *From) {
return From->getSubclassID() == ClassID::Select;
}
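This makes swapValues() follow the usual SandboxIR mutation idiom: record the inverse action first, mutate the wrapped llvm:: instruction second. A hedged usage sketch (the save/revert Tracker calls are assumed beyond what this hunk shows):

    // With tracking enabled, the swap can be rolled back:
    Ctx.getTracker().save();   // assumed API: begin tracking changes
    Sel->swapValues();         // queues a UseSwap, then mutates the IR
    Ctx.getTracker().revert(); // assumed API: replays the UseSwap to undo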
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4da3618..9f89264 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -353,23 +353,20 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
}
/// Match an fpext from half to float, or a constant we can convert.
-static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
- if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
- return FPExtSrc->getType()->isHalfTy();
-
- ConstantFP *CFP;
- if (match(Arg, m_ConstantFP(CFP))) {
+static Value *matchFPExtFromF16(Value *Arg) {
+ Value *Src = nullptr;
+ ConstantFP *CFP = nullptr;
+ if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
+ if (Src->getType()->isHalfTy())
+ return Src;
+ } else if (match(Arg, m_ConstantFP(CFP))) {
bool LosesInfo;
APFloat Val(CFP->getValueAPF());
Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
- if (LosesInfo)
- return false;
-
- FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
- return true;
+ if (!LosesInfo)
+ return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
}
-
- return false;
+ return nullptr;
}
// Trim all zero components from the end of the vector \p UseV and return
@@ -839,15 +836,16 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!ST->hasMed3_16())
break;
- Value *X, *Y, *Z;
-
// Repeat floating-point width reduction done for minnum/maxnum.
// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
- if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
- matchFPExtFromF16(Src2, Z)) {
- Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
- {X, Y, Z}, &II, II.getName());
- return new FPExtInst(NewCall, II.getType());
+ if (Value *X = matchFPExtFromF16(Src0)) {
+ if (Value *Y = matchFPExtFromF16(Src1)) {
+ if (Value *Z = matchFPExtFromF16(Src2)) {
+ Value *NewCall = IC.Builder.CreateIntrinsic(
+ IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
+ return new FPExtInst(NewCall, II.getType());
+ }
+ }
}
break;
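The helper's refactor from bool-plus-out-parameter to returning the matched value is what enables the nested init-statement chain below it; in miniature:

    // before: Value *X, *Y;  // declared up front, unset on failed matches
    //         if (matchFPExtFromF16(A, X) && matchFPExtFromF16(B, Y)) use(X, Y);
    // after:  if (Value *X = matchFPExtFromF16(A))
    //           if (Value *Y = matchFPExtFromF16(B))
    //             use(X, Y);  // each name is scoped to its success path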
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 56f4efd..e657f66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);
if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9a6d617..62e22c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
- if (IsPermLane16) {
+ if (IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));
Pieces.push_back(
- IsPermLane16
+ IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
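Both the GlobalISel and SelectionDAG paths now classify set_inactive as a two-source lane op, so the inactive value is widened and split in lockstep with the active one; the shared predicate (verbatim from the hunks above):

    bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                         IID == Intrinsic::amdgcn_set_inactive_chain_arg;
    // Every splitting decision that previously tested IsPermLane16 now tests
    // (IsSetInactive || IsPermLane16), covering Src1 ext/unmerge/bitcast.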
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 899d937..e4a679f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
- assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
- for (auto &Op : MI.implicit_operands()) {
- if (Op.isDef())
- continue;
- Register OpReg = Op.getReg();
- if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::SCC)
- continue;
- return OpReg;
- }
- return Register();
-}
-
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
- ? AMDGPU::V_MOV_B64_PSEUDO
- : AMDGPU::V_MOV_B32_e32;
- Register ExecReg = RI.getExec();
+ case AMDGPU::V_SET_INACTIVE_B32: {
+ // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
Register DstReg = MI.getOperand(0).getReg();
- MachineOperand &ActiveSrc = MI.getOperand(1);
- MachineOperand &InactiveSrc = MI.getOperand(2);
-
- // Find implicit register defining lanes active outside WWM.
- Register ExecSrcReg = findSetInactiveMask(MI);
- assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
- // Note: default here is set to ExecReg so that functional MIR is still
- // generated if implicit def is not found and assertions are disabled.
- if (!ExecSrcReg)
- ExecSrcReg = ExecReg;
-
- // Ideally in WWM this operation is lowered to V_CNDMASK; however,
- // constant bus constraints and the presence of literal constants
- // present an issue.
- // Fallback to V_MOV base lowering in all but the common cases.
- const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- MachineFunction *MF = MBB.getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
- const MCInstrDesc &Desc = get(Opcode);
-
- const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
- const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
- const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
- const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
- const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
- const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
- int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
- int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
- int ConstantBusUses =
- 1 + // Starts at 1 for ExecSrcReg
- (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
- (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
- int LiteralConstants =
- ((ActiveSrc.isReg() ||
- (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
- ? 0
- : 1) +
- ((InactiveSrc.isReg() ||
- (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
- ? 0
- : 1);
-
- bool UseVCndMask =
- ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
- if (VMov64 && UseVCndMask) {
- // Decomposition must not introduce new literals.
- UseVCndMask &=
- ActiveSrc.isReg() ||
- (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
- (!isInlineConstant(ActiveImm));
- UseVCndMask &= InactiveSrc.isReg() ||
- (isInlineConstant(InactiveImmLo) &&
- isInlineConstant(InactiveImmHi)) ||
- (!isInlineConstant(InactiveImm));
- }
-
- if (UseVCndMask && VMov64) {
- // Dual V_CNDMASK_B32
- MachineOperand ActiveLo = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand ActiveHi = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
- MachineOperand InactiveLo = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand InactiveHi = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
- if (ActiveSrc.isReg())
- ActiveHi.setIsKill(ActiveSrc.isKill());
- if (InactiveSrc.isReg())
- InactiveHi.setIsKill(InactiveSrc.isKill());
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
- .addImm(0)
- .add(InactiveLo)
- .addImm(0)
- .add(ActiveLo)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
- .addImm(0)
- .add(InactiveHi)
- .addImm(0)
- .add(ActiveHi)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- } else if (UseVCndMask) {
- // Single V_CNDMASK_B32
- BuildMI(MBB, MI, DL, Desc, DstReg)
- .addImm(0)
- .add(InactiveSrc)
- .addImm(0)
- .add(ActiveSrc)
- .addReg(ExecSrcReg);
- } else {
- // Fallback V_MOV case.
- // Avoid unnecessary work if a source VGPR is also the destination.
- // This can happen if WWM register allocation was efficient.
- // Note: this assumes WWM execution.
- bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
- bool DstIsInactive =
- InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
- if (!DstIsInactive) {
- // Set exec mask to inactive lanes,
- // but only if active lanes would be overwritten.
- if (DstIsActive) {
- BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
- .addReg(ExecSrcReg)
- .setOperandDead(3); // Dead scc
- }
- // Copy inactive lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- if (!DstIsActive) {
- // Set exec mask to active lanes
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
- // Copy active lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
- .add(ActiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- // Restore WWM
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
- }
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(5));
MI.eraseFromParent();
break;
}
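With the lanes mask now an explicit operand (operand 5) and source modifiers in operands 1 and 3, the pseudo expands to a single conditional move; the operand mapping, as inferred from the BuildMI above:

    //   V_SET_INACTIVE_B32 dst, mods0, active, mods1, inactive, mask
    //     -> V_CNDMASK_B32_e64 dst, mods1, inactive, mods0, active, mask
    // (src0 supplies lanes whose mask bit is clear, i.e. the inactive lanes;
    //  src1 supplies the active lanes.)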
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7143251..4fd9b43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ public:
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
-
- static Register findSetInactiveMask(const MachineInstr &MI);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9f0796..284be72 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
-let Defs = [SCC], isConvergent = 1 in {
-def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
-
-def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
-} // End Defs = [SCC]
+let isConvergent = 1 in
+def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
-}
-
-foreach vt = Reg64Types.types in {
-def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
- (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
-
-def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
- (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 29fef49..3bf2ea0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : RPOT) {
bool InWWM = false;
for (MachineInstr &MI : *MBB) {
- if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
RegsAssigned |= processDef(MI.getOperand(0));
if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38ebda6..8cedc34 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -557,26 +557,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// This avoids unnecessarily marking M0 as requiring WQM.
III.Needs |= StateStrictWQM;
GlobalFlags |= StateStrictWQM;
- } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
- Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- // Ignore these if V_SET_INACTIVE which already has exec src register.
- // These are generated by an earlier pass which has seperately ensured
- // WWM and provided a mask of inactive lanes.
- Register ExecSrc = TII->findSetInactiveMask(MI);
- if (!ExecSrc) {
- // Disable strict states; StrictWQM will be added as required later.
- III.Disabled = StateStrict;
- MachineOperand &Inactive = MI.getOperand(2);
- if (Inactive.isReg()) {
- if (Inactive.isUndef()) {
- LowerToCopyInstrs.insert(&MI);
- } else {
- markOperand(MI, Inactive, StateStrictWWM, Worklist);
- }
- }
- SetInactiveInstrs.push_back(&MI);
- BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
+ // Disable strict states; StrictWQM will be added as required later.
+ III.Disabled = StateStrict;
+ MachineOperand &Inactive = MI.getOperand(4);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
+ LowerToCopyInstrs.insert(&MI);
+ else
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
+ SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
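The new condition above is compact; spelled out as a predicate (names illustrative, a sketch rather than the pass code):

```cpp
// V_SET_INACTIVE_B32 degenerates to a plain copy of the active source only
// when the inactive operand (operand 4) is an undef register and its
// source-modifier immediate (operand 3) is zero.
static bool LowersToCopy(bool InactiveIsReg, bool InactiveIsUndef,
                         long long InactiveModsImm) {
  return InactiveIsReg && InactiveIsUndef && InactiveModsImm == 0;
}
```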
@@ -1078,10 +1070,11 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
ActiveLanesReg = 0;
break;
case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64:
if (ActiveLanesReg) {
- MI.addOperand(*MBB.getParent(),
- MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
+ MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
+ MI.getOperand(5).setReg(ActiveLanesReg);
+ LIS->shrinkToUses(&LI);
} else {
assert(State == StateExact || State == StateWQM);
}
@@ -1527,15 +1520,17 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
LLVM_DEBUG(dbgs() << "simplify: " << *MI);
- if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
- assert(MI->getNumExplicitOperands() == 3);
+ if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
+ assert(MI->getNumExplicitOperands() == 6);
LiveInterval *RecomputeLI = nullptr;
- if (MI->getOperand(2).isReg())
- RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg());
+ if (MI->getOperand(4).isReg())
+ RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
- MI->removeOperand(2);
+ MI->removeOperand(5);
+ MI->removeOperand(4);
+ MI->removeOperand(3);
+ MI->removeOperand(1);
if (RecomputeLI)
LIS->shrinkToUses(RecomputeLI);
@@ -1547,12 +1542,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
? (unsigned)AMDGPU::COPY
: TII->getMovOpcode(TRI->getRegClassForOperandReg(
*MRI, MI->getOperand(0)));
- int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- while (Index >= 0) {
- MI->removeOperand(Index);
- Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- }
-
MI->setDesc(TII->get(CopyOp));
LLVM_DEBUG(dbgs() << " -> " << *MI);
}
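Note the order of the removeOperand calls in the hunk above. Each removal shifts every later operand down by one, so the indices must be stripped from highest to lowest; a small standalone model:

```cpp
#include <cstdio>
#include <vector>

// Dropping operands 5, 4, 3 and 1 in descending order leaves exactly the
// {vdst, src0} pair that the subsequent COPY rewrite needs.
int main() {
  std::vector<const char *> Ops = {"vdst",      "src0_mods", "src0",
                                   "src1_mods", "src1",      "mask"};
  for (int Idx : {5, 4, 3, 1})
    Ops.erase(Ops.begin() + Idx); // ascending order would misindex here
  for (const char *Op : Ops)
    std::printf("%s ", Op); // prints: vdst src0
  std::printf("\n");
}
```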
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4..082546c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -139,16 +139,21 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <numeric>
#include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
// generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+ bool IsGridConstant) {
Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
assert(I && "OldUse must be in an instruction");
struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [HasCvtaParam,
+ IsGridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+ if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+ if (MI->getRawSource() == I.OldUse->get()) {
+ // Convert to memcpy/memmove from param space.
+ IRBuilder<> Builder(I.OldInstruction);
+ Intrinsic::ID ID = MI->getIntrinsicID();
+
+ CallInst *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ for (unsigned I : {0, 1})
+ if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+ B->addDereferenceableParamAttr(I, Bytes);
+ return B;
+ }
+ // We may be able to handle other cases if the argument is
+ // __grid_constant__
+ }
- if (GridConstant) {
+ if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
OriginalUser->getIterator());
return CvtToGenCall;
};
-
- if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
- I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
- return CI;
+ auto *ParamInGenericAS =
+ GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+ // phi/select could use generic arg pointers w/o __grid_constant__
+ if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ if (V.get() == I.OldUse->get())
+ PHI->setIncomingValue(Idx, ParamInGenericAS);
+ }
}
- if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
- // byval address is being stored, cast it to generic
- if (SI->getValueOperand() == I.OldUse->get())
- SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
- return SI;
+ if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+ if (SI->getTrueValue() == I.OldUse->get())
+ SI->setTrueValue(ParamInGenericAS);
+ if (SI->getFalseValue() == I.OldUse->get())
+ SI->setFalseValue(ParamInGenericAS);
}
- if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
- if (PI->getPointerOperand() == I.OldUse->get())
- PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
- return PI;
+
+ // Escapes or writes can only use generic param pointers if
+ // __grid_constant__ is in effect.
+ if (IsGridConstant) {
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(ParamInGenericAS);
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, ParamInGenericAS);
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, ParamInGenericAS);
+ return PI;
+ }
+ // TODO: If we allow stores, we should allow memcpy/memset to the
+ // parameter, too.
}
- llvm_unreachable(
- "Instruction unsupported even for grid_constant argument");
}
llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,110 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+ using Base = PtrUseVisitor<ArgUseChecker>;
+
+ bool IsGridConstant;
+ // Set of phi/select instructions using the Arg
+ SmallPtrSet<Instruction *, 4> Conditionals;
+
+ ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+ : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+ PtrInfo visitArgPtr(Argument &A) {
+ assert(A.getType()->isPointerTy());
+ IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+ IsOffsetKnown = false;
+ Offset = APInt(IntIdxTy->getBitWidth(), 0);
+ PI.reset();
+ Conditionals.clear();
+
+ LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+ // Enqueue the uses of this pointer.
+ enqueueUsers(A);
+
+ // Visit all the uses off the worklist until it is empty.
+ // Note that unlike PtrUseVisitor we intentionally do not track offsets.
+ // We're only interested in how we use the pointer.
+ while (!(Worklist.empty() || PI.isAborted())) {
+ UseToVisit ToVisit = Worklist.pop_back_val();
+ U = ToVisit.UseAndIsOffsetKnown.getPointer();
+ Instruction *I = cast<Instruction>(U->getUser());
+ if (isa<PHINode>(I) || isa<SelectInst>(I))
+ Conditionals.insert(I);
+ LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+ Base::visit(I);
+ }
+ if (PI.isEscaped())
+ LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+ << "\n");
+ else if (PI.isAborted())
+ LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
+ << " conditionals\n");
+ return PI;
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ // Storing the pointer escapes it.
+ if (U->get() == SI.getValueOperand())
+ return PI.setEscapedAndAborted(&SI);
+ // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+ // copy.
+ if (!IsGridConstant)
+ return PI.setAborted(&SI);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ // ASCs to param space are no-ops and do not need a copy.
+ if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+ return PI.setEscapedAndAborted(&ASC);
+ Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ if (IsGridConstant)
+ return;
+ Base::visitPtrToIntInst(I);
+ }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ }
+ // PHI and select just pass through the pointers.
+ void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+ void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+ // memcpy/memmove are OK when the pointer is the source. We can convert them to
+ // AS-specific memcpy.
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ if (!IsGridConstant)
+ PI.setAborted(&II);
+ }
+}; // struct ArgUseChecker
+} // namespace
+
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
- bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
+ bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto AreSupportedUsers = [&](Value *Start) {
- SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
- if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
- return true;
- // ASC to param space are OK, too -- we'll just strip them.
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
- if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
- return true;
- }
- // Simple calls and stores are supported for grid_constants
- // writes to these pointers are undefined behaviour
- if (IsGridConstant &&
- (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
- return true;
- return false;
- };
-
- while (!ValuesToCheck.empty()) {
- Value *V = ValuesToCheck.pop_back_val();
- if (!IsSupportedUse(V)) {
- LLVM_DEBUG(dbgs() << "Need a "
- << (isParamGridConstant(*Arg) ? "cast " : "copy ")
- << "of " << *Arg << " because of " << *V << "\n");
- (void)Arg;
- return false;
- }
- if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
- !isa<PtrToIntInst>(V))
- llvm::append_range(ValuesToCheck, V->users());
- }
- return true;
- };
-
- if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+ ArgUseChecker AUC(DL, IsGridConstant);
+ ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+ bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
+ // Easy case, accessing parameter directly is fine.
+ if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +567,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
for (Use *U : UsesToUpdate)
- convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
@@ -473,13 +578,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- const DataLayout &DL = Func->getDataLayout();
+ // We can't access the byval arg directly and need a pointer. On sm_70+ we
+ // have the ability to take a pointer to the argument without making a local
+ // copy. However, we're still not allowed to write to it. If the user
+ // specified `__grid_constant__` for the argument, we'll treat the escaped
+ // pointer as read-only.
unsigned AS = DL.getAllocaAddrSpace();
- if (isParamGridConstant(*Arg)) {
- // Writes to a grid constant are undefined behaviour. We do not need a
- // temporary copy. When a pointer might have escaped, conservatively replace
- // all of its uses (which might include a device function call) with a cast
- // to the generic address space.
+ if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
+ LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+ // Replace all argument pointer uses (which might include a device function
+ // call) with a cast to the generic address space using cvta.param
+ // instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
@@ -500,6 +609,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Do not replace Arg in the cast to param space
CastToParam->setOperand(0, Arg);
} else {
+ LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
// Otherwise we have to create a temporary copy.
AllocaInst *AllocA =
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
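The control flow of handleByValParam now reduces to a three-way choice; a sketch with the inputs assumed precomputed as in the hunk (names illustrative):

```cpp
enum class ByValLowering { DirectParamAS, CvtaParamCast, LocalCopy };

// ReadOnly is "neither escaped nor aborted" from ArgUseChecker;
// HasConditionals reflects phi/select uses of the argument pointer.
static ByValLowering Pick(bool ReadOnly, bool HasConditionals,
                          bool HasCvtaParam, bool IsGridConstant) {
  if (ReadOnly && !HasConditionals)
    return ByValLowering::DirectParamAS; // rewrite uses into param AS
  if (HasCvtaParam && (ReadOnly || IsGridConstant))
    return ByValLowering::CvtaParamCast; // cvta.param pointer, no local copy
  return ByValLowering::LocalCopy;       // alloca plus copy at entry
}
```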
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0591782..457f10f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -94,6 +94,7 @@ public:
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d2e6257..4554163 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1106,18 +1106,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (!isTypeLegal(VT))
continue;
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
- setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
Custom);
+ setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
+ setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
+ Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
- setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
if (Subtarget.hasStdExtZfbfmin())
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
- Custom);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
// TODO: Promote to fp32.
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 13212c2..02f65ff 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2918,7 +2918,7 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
// if any possible.
if (MO.getTargetFlags() == RISCVII::MO_PCREL_LO &&
(MI.getMF()->getTarget().getFunctionSections() || F.hasComdat() ||
- F.hasSection()))
+ F.hasSection() || F.getSectionPrefix()))
return outliner::InstrType::Illegal;
}
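The one-token change above widens the bail-out condition; as a predicate (a sketch, with the section-prefix case covering, for example, hot/unlikely prefixes derived from profile data):

```cpp
// Outlining an MO_PCREL_LO user is illegal whenever the function may be
// placed in a non-default text section, now including section prefixes,
// not only explicit sections, comdats, or -ffunction-sections.
static bool BlocksPcrelLoOutlining(bool FunctionSections, bool HasComdat,
                                   bool HasSection, bool HasSectionPrefix) {
  return FunctionSections || HasComdat || HasSection || HasSectionPrefix;
}
```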
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 5cc084f..1875a8f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -96,6 +96,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::STORE, T, Custom);
}
}
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::LOAD, MVT::v8f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f16, Custom);
+ }
if (Subtarget->hasReferenceTypes()) {
// We need custom load and store lowering for both externref, funcref and
// Other. The MVT::Other here represents tables of reference types.
@@ -208,6 +212,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
MVT::v2f64})
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ if (Subtarget->hasFP16())
+ setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom);
+
// We have custom shuffle lowering to expose the shuffle mask
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
MVT::v2f64})
@@ -2055,6 +2062,18 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::v8f16) {
+ // BUILD_VECTOR can't handle FP16 operands since Wasm doesn't have a scalar
+ // FP16 type, so cast them to I16s.
+ MVT IVT = VT.changeVectorElementType(MVT::i16);
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0, E = Op.getNumOperands(); I < E; ++I)
+ NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
+ return DAG.getBitcast(VT, Res);
+ }
+
if (auto ConvertLow = LowerConvertLow(Op, DAG))
return ConvertLow;
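The bitcast trick above never touches lane payloads; it only relabels types. A scalar-sized analogue in standard C++ (using float/uint32_t, since portable C++ lacks a scalar f16 much as Wasm does):

```cpp
#include <array>
#include <bit>
#include <cstdint>
#include <cstdio>

// Reinterpret each FP lane as a same-width integer, build the integer
// vector, then bitcast the whole result back; values round-trip exactly.
int main() {
  std::array<float, 4> Lanes = {1.0f, -2.5f, 0.0f, 3.75f};
  std::array<uint32_t, 4> IntLanes{};
  for (unsigned I = 0; I < Lanes.size(); ++I)
    IntLanes[I] = std::bit_cast<uint32_t>(Lanes[I]); // per-lane bitcast
  auto Back = std::bit_cast<std::array<float, 4>>(IntLanes); // vector bitcast
  std::printf("%g %g %g %g\n", Back[0], Back[1], Back[2], Back[3]);
}
```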
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 9d17d90..9be23da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -189,7 +189,7 @@ defm LOAD_V128_A64 :
}
// Def load patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
defm : LoadPat<vec.vt, load, "LOAD_V128">;
}
@@ -390,7 +390,7 @@ defm STORE_V128_A64 :
}
// Def store patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
defm : StorePat<vec.vt, store, "STORE_V128">;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b821da0..3b6b154 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1182,13 +1182,23 @@ public:
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decision to all instructions inside the group.
- /// But the cost will be assigned to one instruction only.
+ /// When interleaving, the cost will only be assigned to one instruction, the
+ /// insert position. For other cases, add the appropriate fraction of the
+ /// total cost to each instruction. This ensures accurate costs are used,
+ /// even if the insert position instruction is not used.
+ InstructionCost InsertPosCost = Cost;
+ InstructionCost OtherMemberCost = 0;
+ if (W != CM_Interleave)
+ OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
if (auto *I = Grp->getMember(Idx)) {
if (Grp->getInsertPos() == I)
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ WideningDecisions[std::make_pair(I, VF)] =
+ std::make_pair(W, InsertPosCost);
else
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ WideningDecisions[std::make_pair(I, VF)] =
+ std::make_pair(W, OtherMemberCost);
}
}
}
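The arithmetic matches the cost-model test updates later in this patch: a stride-5 f32 interleave group at VF 2 with a total cost of 15 used to record 15 on the insert position and 0 on the other four members, and now records 15 / 5 = 3 on every member. A minimal model:

```cpp
#include <cstdio>

int main() {
  const int TotalCost = 15, NumMembers = 5; // stride-5 f32 group at VF 2
  const bool IsInterleave = false;          // CM_Interleave keeps old split
  int InsertPosCost = IsInterleave ? TotalCost : TotalCost / NumMembers;
  int OtherMemberCost = IsInterleave ? 0 : TotalCost / NumMembers;
  std::printf("insert-pos=%d, others=%d\n", InsertPosCost, OtherMemberCost);
}
```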
@@ -8663,6 +8673,7 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
index e9be6f5..c5d2ebf 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
@@ -28,24 +28,24 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F,
return PA;
}
-bool SandboxVectorizerPass::runImpl(Function &F) {
+bool SandboxVectorizerPass::runImpl(Function &LLVMF) {
// If the target claims to have no vector registers early return.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n");
return false;
}
- LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << LLVMF.getName() << ".\n");
// Early return if the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (LLVMF.hasFnAttribute(Attribute::NoImplicitFloat)) {
LLVM_DEBUG(dbgs() << "SBVec: NoImplicitFloat attribute, return.\n");
return false;
}
- sandboxir::Context Ctx(F.getContext());
- // Create SandboxIR for `F`.
- sandboxir::Function &SBF = *Ctx.createFunction(&F);
+ sandboxir::Context Ctx(LLVMF.getContext());
+ // Create SandboxIR for `LLVMF`.
+ sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
// TODO: Initialize SBVec Pass Manager
- (void)SBF;
+ (void)F;
return false;
}
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
index 29dce5f..57ae02a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
@@ -82,26 +82,26 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -139,11 +139,11 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
index 0e7b1c5..2cc5150 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
@@ -108,34 +108,34 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -180,13 +180,13 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
index 8830aff..1899741 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
@@ -121,22 +121,22 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
@@ -145,14 +145,14 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -203,14 +203,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index cfd3d78..2d4b300 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -71,10 +71,10 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
index 5ec5b51..5dfb25e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -88,12 +88,12 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 450743d..bd88ca8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -107,14 +107,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
index 5e5c718..9c079863 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -72,21 +72,21 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -114,16 +114,16 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 62541fa2..99a735d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -133,18 +133,18 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
index cfed855..168e916 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -94,27 +94,27 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -152,20 +152,20 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
index 07939b9..919a17e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -105,30 +105,30 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index 964a9b6..6737c72 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -93,31 +93,31 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
index 6653198..46d56a7 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -123,41 +123,41 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
index b3a5cbe..4d65abd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
@@ -138,46 +138,46 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index c0ea210..28a6443 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -73,8 +73,8 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
;
entry:
br label %for.body
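
The indices-01uu hunk above shows the same split when only part of the group is demanded: the VF 64 total of 160, formerly attributed to %v0 alone, now divides over the two loaded indices, 160 / 2 = 80 per load, so the group total is again preserved (a hedged reading, assuming the totals are meant to be unchanged).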
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
index 2a261ca..5cad7bf 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -82,26 +82,26 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -139,11 +139,11 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
index 8bf3071..cfb83d4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -108,34 +108,34 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -180,13 +180,13 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
index 3182de2..7757854 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
@@ -121,22 +121,22 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
@@ -145,14 +145,14 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -203,14 +203,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index 27e2ee0..cf350cc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -71,10 +71,10 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
index c377232..9ca0d8c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
@@ -88,12 +88,12 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
index 2eb7c5e..86ee6c8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
@@ -107,14 +107,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
index c11da43..f6143d4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -72,21 +72,21 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -114,16 +114,16 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index de57af6..43dc53d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -133,18 +133,18 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
index 949c1af..70ed74dc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -94,27 +94,27 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -152,20 +152,20 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
index 4388ccf..401e4de 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
@@ -105,30 +105,30 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
index 6078fb4..ef3c80c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -93,31 +93,31 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
index 778a4e7..8e7c316 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
@@ -123,41 +123,41 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
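
Note on the stride-7 load hunk above: the rewritten expectations redistribute the interleave-group cost evenly across all seven members instead of charging it entirely to %v0 (with zeros on %v1..%v6). As an arithmetic check derived purely from the values in the hunk, the new per-member costs multiply back to the old lump sums:

  VF 2:  7 x 4  = 28
  VF 4:  7 x 8  = 56
  VF 8:  7 x 16 = 112
  VF 16: 7 x 32 = 224
  VF 32: 7 x 65 = 455
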
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
index a230b5a..752cc22 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
@@ -138,46 +138,46 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
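
The stride-8 load expectations above follow the same even split, this time across eight members:

  VF 2:  8 x 4  = 32
  VF 4:  8 x 8  = 64
  VF 8:  8 x 16 = 128
  VF 16: 8 x 32 = 256
  VF 32: 8 x 65 = 520
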
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index c1a66c1..ed2bb3f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
index 7be9577..a539877 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
;
entry:
br label %for.body
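
Likewise for the AVX512 stride-8 store groups in the two files above (the f64 and i64 variants carry identical values): the updated VF 8 and VF 16 rows spread the cost that was previously attached only to the final member %v7:

  VF 8:  8 x 10 = 80
  VF 16: 8 x 20 = 160

The VF 4 rows are unchanged and still charge 46 to %v7 alone.
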
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 13a8442..41dd266 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -38,8 +38,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
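
The masked-interleaved VF 2 updates split each pair the same way: test1's 12 becomes 6 + 6 and test2's 10 becomes 5 + 5 across the two stores, while the VF 4 and VF 8 rows keep the old single-member attribution.
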
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9c634ab..89c3bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -85,7 +85,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -159,7 +160,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -233,7 +235,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -300,7 +303,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index fdce9d9..8eef3d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -77,7 +77,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -170,7 +171,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -255,7 +257,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 137366a..f015099 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,15 +4,13 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -45,17 +43,15 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -89,22 +85,18 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0
-; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c
+; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3]
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s6, 56
-; GCN-NEXT: s_cselect_b32 s3, 1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_cmp_lg_u32 s4, 56
; GCN-NEXT: v_mov_b32_e32 v1, v0
-; GCN-NEXT: s_cmp_lg_u32 s3, 0
+; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
@@ -145,15 +137,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -174,13 +167,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -193,15 +188,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -216,15 +212,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -240,19 +237,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 1
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -267,19 +260,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 1.0
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -293,15 +282,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -318,21 +308,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x10001
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
%tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
@@ -345,21 +334,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
%tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
@@ -372,21 +360,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
%tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
@@ -398,17 +385,15 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -422,15 +407,13 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -445,15 +428,13 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -468,15 +449,13 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -491,15 +470,13 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
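
The llvm.amdgcn.set.inactive changes above show the new lowering strategy: rather than flipping exec to -1 and back around plain v_mov writes, the compiler now materializes the inactive value with v_cndmask_b32_e64 under the lane mask saved by s_or_saveexec_b64, matching the new MIR form of V_SET_INACTIVE_B32 (source modifiers plus a trailing exec-mask operand fed by IMPLICIT_DEF) in the global-atomic-fadd tests earlier. Below is a minimal standalone sketch in the style of these tests; it assumes the scalar i32 overloads of the intrinsics (only the vector overloads are quoted verbatim in the hunks above), and something like llc -mtriple=amdgcn -mcpu=gfx900 should exercise the same path:

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define amdgpu_kernel void @set_inactive_example(ptr addrspace(1) %out, i32 %in) {
  ; Active lanes keep %in; lanes inactive on entry observe 42 instead.
  %masked = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  ; strict.wwm returns the whole-wave value to normally exec-masked code.
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %masked)
  store i32 %wwm, ptr addrspace(1) %out
  ret void
}
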
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 5a8df7b8..b17dfc7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2890,68 +2890,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-LABEL: add_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -2970,8 +2967,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2984,68 +2981,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-LABEL: add_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -3064,8 +3058,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8
; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s1, v7
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3077,23 +3071,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: add_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -3184,23 +3176,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: add_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3275,31 +3265,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3388,53 +3377,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -3478,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3594,29 +3582,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6611,68 +6600,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-LABEL: sub_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2
@@ -6691,8 +6677,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6705,68 +6691,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-LABEL: sub_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2
@@ -6785,8 +6768,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8
; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s1, v7
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6798,23 +6781,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: sub_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -6905,23 +6886,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: sub_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6996,31 +6975,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7109,53 +7087,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -7199,31 +7177,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7315,29 +7292,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
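
The hunks above and below are the kind of FileCheck expectations typically regenerated with llvm/utils/update_llc_test_checks.py; reading the deltas, the changes are VGPR renumbering, instruction rescheduling, and updated s_delay_alu hints, while the reduction itself is unchanged. For orientation, a minimal sketch of the shape of IR test these checks cover follows; the exact bodies live in the .ll files being diffed, and @local_var64 plus the use of workitem.id.x as the varying input are illustrative assumptions, not copied from this patch:

; Sketch (assumed names): each lane feeds a varying i64 into one LDS
; atomic, which the atomic-optimizer rewrite turns into the DPP wave
; scan plus the single ds_add_rtn_u64 visible in the checks above.
@local_var64 = internal addrspace(3) global i64 poison, align 8

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %val = zext i32 %lane to i64
  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %val acq_rel
  store i64 %old, ptr addrspace(1) %out
  ret void
}
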
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6bf03a2..988bc8e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2361,84 +2361,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: add_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB6_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -2450,83 +2448,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: add_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB6_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -2538,23 +2534,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: add_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -2605,57 +2599,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB6_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: add_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -2685,33 +2678,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB6_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -2722,31 +2716,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -2790,34 +2783,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB6_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
@@ -2830,77 +2824,77 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB6_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
@@ -3163,164 +3157,158 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX8_DPP-LABEL: add_i64_varying_nouse:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v4, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s0
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX8_DPP-NEXT: ds_add_u64 v7, v[8:9]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB7_2:
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: add_i64_varying_nouse:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX9_DPP-NEXT: ds_add_u64 v7, v[8:9]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB7_2:
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: add_i64_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -3345,16 +3333,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4
; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_add_u64 v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB7_2:
@@ -3362,25 +3351,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX1032_DPP-LABEL: add_i64_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3398,14 +3383,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1032_DPP-NEXT: ds_add_u64 v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB7_2:
@@ -3417,30 +3403,29 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
@@ -3460,15 +3445,16 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1164_DPP-NEXT: ds_add_u64 v0, v[6:7]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB7_2:
@@ -3480,49 +3466,49 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, v3
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1132_DPP-NEXT: ds_add_u64 v0, v[6:7]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB7_2:
@@ -5909,84 +5895,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: sub_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB14_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -5998,83 +5982,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: sub_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB14_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -6086,23 +6068,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: sub_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -6153,57 +6133,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB14_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: sub_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6233,33 +6212,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB14_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -6270,31 +6250,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6338,34 +6317,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB14_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
@@ -6378,77 +6358,77 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB14_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
@@ -7432,129 +7412,121 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: and_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB16_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX8_DPP-NEXT: v_and_b32_e32 v6, s0, v6
+; GFX8_DPP-NEXT: v_and_b32_e32 v5, s1, v5
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: and_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB16_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT: v_and_b32_e32 v6, s0, v6
+; GFX9_DPP-NEXT: v_and_b32_e32 v5, s1, v5
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: and_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7575,63 +7547,62 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB16_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: and_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7643,199 +7614,201 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB16_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: and_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB16_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: and_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB16_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -8816,54 +8789,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: or_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB18_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6
@@ -8874,53 +8845,51 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: or_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB18_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6
@@ -8931,14 +8900,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8959,63 +8926,62 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB18_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: or_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9027,199 +8993,201 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB18_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: or_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB18_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: or_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB18_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -10200,54 +10168,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: xor_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB20_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10258,53 +10224,51 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: xor_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB20_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10315,14 +10279,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10343,63 +10305,62 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB20_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: xor_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10411,199 +10372,201 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB20_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: xor_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB20_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: xor_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB20_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -10934,11 +10897,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -10982,11 +10942,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
-; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11885,20 +11842,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-LABEL: max_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_mov_b32 s0, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX8_DPP-NEXT: s_brev_b32 s1, 1
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
@@ -11953,22 +11906,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB23_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -11984,20 +11937,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-LABEL: max_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_mov_b32 s0, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX9_DPP-NEXT: s_brev_b32 s1, 1
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
@@ -12052,21 +12001,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB23_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -12081,15 +12030,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: max_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: s_mov_b32 s4, 0
; GFX1064_DPP-NEXT: s_brev_b32 s5, 1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12156,30 +12103,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB23_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10]
@@ -12191,15 +12139,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: max_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: s_mov_b32 s0, 0
; GFX1032_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12244,29 +12190,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB23_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -12278,79 +12225,79 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: max_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: s_mov_b32 s0, 0
-; GFX1164_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_mov_b32 s4, 0
+; GFX1164_DPP-NEXT: s_brev_b32 s5, 1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s5
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12369,29 +12316,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB23_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12406,55 +12354,54 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_DPP-LABEL: max_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: s_mov_b32 s0, 0
; GFX1132_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12462,27 +12409,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB23_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12821,11 +12769,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -12869,11 +12814,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
-; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13771,21 +13713,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: min_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: s_mov_b32 s6, -1
; GFX8_DPP-NEXT: s_brev_b32 s7, -2
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
@@ -13839,22 +13777,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB26_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1
@@ -13868,21 +13806,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: min_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: s_mov_b32 s6, -1
; GFX9_DPP-NEXT: s_brev_b32 s7, -2
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
@@ -13936,21 +13870,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB26_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1
@@ -13964,15 +13898,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: min_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14039,29 +13971,30 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB26_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10]
@@ -14073,15 +14006,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: min_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14126,28 +14057,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB26_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -14159,79 +14091,79 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: min_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
; GFX1164_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14250,29 +14182,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB26_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -14287,82 +14219,82 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_DPP-LABEL: min_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_brev_b32 s7, -2
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: s_brev_b32 s7, -2
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB26_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -15634,85 +15566,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: umax_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB29_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15726,84 +15657,83 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: umax_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB29_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15817,18 +15747,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umax_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15892,30 +15818,31 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB29_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10]
@@ -15927,13 +15854,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umax_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
@@ -15978,29 +15903,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB29_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -16016,73 +15942,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16101,29 +16027,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB29_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -16142,49 +16069,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16192,27 +16119,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB29_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -17485,201 +17413,193 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: umin_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB32_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
-; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: umin_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB32_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
-; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: umin_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17743,30 +17663,31 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB32_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10]
@@ -17778,13 +17699,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umin_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
@@ -17829,29 +17748,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB32_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -17867,73 +17787,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17952,29 +17872,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB32_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -17993,49 +17914,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18043,27 +17964,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB32_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 6bd0b11..d62ff37 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -59,7 +59,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -120,7 +121,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -177,7 +179,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
@@ -186,8 +189,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; GFX11_GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 6766c0c..946ee9e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -61,7 +61,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -80,7 +81,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.2
; GFX90A-NEXT: {{ $}}
@@ -102,11 +103,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: bb.4 (%ir-block.35):
; GFX90A-NEXT: successors: %bb.3(0x80000000)
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -142,7 +143,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -161,7 +163,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: S_BRANCH %bb.2
; GFX940-NEXT: {{ $}}
@@ -183,11 +185,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: bb.4 (%ir-block.35):
; GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
+ ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -219,7 +221,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -229,8 +232,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+ ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
@@ -244,7 +247,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
@@ -266,11 +269,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: bb.4 (%ir-block.32):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
- ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+ ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]]
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 311c609..2b18f47 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,10 +816,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2027,10 +2024,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3298,10 +3292,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4065,10 +4056,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5361,10 +5349,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7330,44 +7315,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7463,9 +7449,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7585,9 +7571,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7691,12 +7677,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -7813,42 +7799,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8907,44 +8893,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9008,9 +8995,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9096,9 +9083,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9168,12 +9155,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9259,42 +9246,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -10330,44 +10317,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10431,9 +10419,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10519,9 +10507,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10591,12 +10579,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -10682,42 +10670,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -11235,44 +11223,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11336,9 +11325,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11424,9 +11413,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11496,12 +11485,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11587,42 +11576,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -13305,44 +13294,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13438,9 +13428,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13560,9 +13550,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13666,12 +13656,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -13788,42 +13778,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 9dc82b1..e3144ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 945583c..ddc1031 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 3bc0f25..f353edf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,10 +894,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2217,10 +2214,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3540,10 +3534,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4359,10 +4350,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5681,10 +5669,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7650,44 +7635,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7783,9 +7769,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7905,9 +7891,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8011,12 +7997,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8133,42 +8119,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -9226,44 +9212,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9327,9 +9314,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9415,9 +9402,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9487,12 +9474,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9578,42 +9565,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -10649,44 +10636,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10750,9 +10738,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10838,9 +10826,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10910,12 +10898,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11001,42 +10989,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -11554,44 +11542,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11655,9 +11644,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11743,9 +11732,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11815,12 +11804,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11906,42 +11895,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -13623,44 +13612,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13756,9 +13746,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13878,9 +13868,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13984,12 +13974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14106,42 +14096,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1132-DPP-NEXT: ; %bb.1:
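
For orientation before the next file: the GFX9/GFX10/GFX11 check lines above all cover the same wave-level scan, where the divergent double operand is folded across lanes with DPP moves (row_shr, row_xmask, row_bcast, permlanex16) and v_add_f64 before a single lane performs the atomic. A minimal IR input of the shape these kernels are generated from might look like the sketch below; the helper name @div.double.value and the exact syncscope/ordering are assumptions for illustration, not copied from the test file.

define amdgpu_kernel void @fsub_div_value_sketch(ptr addrspace(1) %ptr) {
  ; Divergent per-lane value; the backend reduces it across the wave
  ; with the DPP sequence shown in the checks above. (Sketch only.)
  %val = call double @div.double.value()
  %old = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
  ret void
}
declare double @div.double.value()
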
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index c1b58f1..fbf8c203 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -93,18 +93,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL11: ; %bb.0:
; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v10
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
-; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL11-NEXT: global_store_b64 v[8:9], v[3:4], off
; DAGISEL11-NEXT: s_endpgm
;
; GISEL10-LABEL: set_inactive_chain_arg_64:
@@ -127,16 +127,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL10: ; %bb.0:
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
-; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[3:4], off
; DAGISEL10-NEXT: s_endpgm
;
; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
@@ -162,18 +162,19 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL11_W64: ; %bb.0:
; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1]
; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[3:4], off
; DAGISEL11_W64-NEXT: s_endpgm
;
; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
@@ -196,16 +197,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL10_W64: ; %bb.0:
; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1]
; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[3:4], off
; DAGISEL10_W64-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
%wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
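
The two context lines above show the IR these checks are generated from: llvm.amdgcn.set.inactive.chain.arg yields %active in lanes enabled by exec and %inactive in disabled lanes, and strict.wwm then reads the result under whole-wave execution. A self-contained sketch of the same pairing (the function name here is chosen for illustration):

define amdgpu_cs_chain void @set_inactive_chain_sketch(ptr addrspace(1) %out, i64 %active, i64 %inactive) {
  ; Active lanes keep %active; inactive lanes observe %inactive.
  %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive)
  ; strict.wwm makes the combined value usable by normal (active-lane) code.
  %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
  store i64 %wwm, ptr addrspace(1) %out
  ret void
}
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64)
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
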
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6dc4a2c..6fb5a9c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,14 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -46,19 +44,19 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -96,11 +94,9 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, s8
; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3]
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, 56
@@ -145,17 +141,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x40400000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -170,21 +164,21 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x4010cccc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -198,17 +192,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x10001
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -222,17 +214,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x3c003c00
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -247,21 +237,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 1
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -276,21 +264,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 1.0
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -304,17 +290,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x3f803f80
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -329,21 +313,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x10001
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -358,21 +341,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -387,21 +369,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -416,19 +397,19 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -442,16 +423,14 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -465,16 +444,14 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -488,16 +465,14 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -511,16 +486,14 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
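
All of the set_inactive_* kernels above follow one template: load %in from the kernel arguments, apply llvm.amdgcn.set.inactive with a type-appropriate inactive constant (42, 1.0, 0x3c003c00, a null pointer, and so on), and store the whole-wave result. A representative sketch for the i32 case, assuming the usual pairing with strict.wwm:

define amdgpu_kernel void @set_inactive_sketch(ptr addrspace(1) %out, i32 %in) {
  ; Inactive lanes read 42; active lanes keep %in.
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
  store i32 %wwm, ptr addrspace(1) %out
  ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

As the hunks show, this pattern now lowers to a single v_cndmask_b32 selecting between the inactive constant and the live value under an s_or_saveexec mask, replacing the earlier exec-toggled pair of v_mov_b32 writes.
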
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 8a5f753..b0fb24e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -111,7 +111,7 @@ body: |
; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5
; GCN-NEXT: $sgpr22 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2
- ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+ ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
@@ -212,7 +212,7 @@ body: |
$sgpr22 = IMPLICIT_DEF
SI_SPILL_S32_SAVE $sgpr22, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
- %0:vgpr_32 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+ %0:vgpr_32 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
bb.1:
KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index ff692ac..92117e0 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1708,8 +1708,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
@@ -1722,8 +1722,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 64a7c445..3013aab 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -40,9 +40,6 @@
define amdgpu_vs void @no_wqm_in_vs() {
ret void
}
- define amdgpu_ps void @preloaded_set_inactive() {
- ret void
- }
...
---
@@ -155,7 +152,7 @@ registers:
- { id: 9, class: sreg_32, preferred-register: '' }
- { id: 10, class: vgpr_32, preferred-register: '' }
- { id: 11, class: vgpr_32, preferred-register: '' }
- - { id: 12, class: sreg_32, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
- { id: 13, class: vgpr_32, preferred-register: '' }
- { id: 14, class: vgpr_32, preferred-register: '' }
- { id: 15, class: vgpr_32, preferred-register: '' }
@@ -179,7 +176,8 @@ body: |
%8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, implicit $exec
%16:vgpr_32 = COPY %8.sub1
%11:vgpr_32 = COPY %16
- %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
+ %17:sreg_64_xexec = IMPLICIT_DEF
+ %10:vgpr_32 = V_SET_INACTIVE_B32 0, %11, 0, undef %12, undef %17, implicit $exec, implicit-def $scc
%14:vgpr_32 = COPY %7
%13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec
@@ -298,8 +296,9 @@ body: |
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:vgpr_32 = COPY $vgpr0
%2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec
- %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
- %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
+ %4:sreg_64_xexec = IMPLICIT_DEF
+ %2.sub0:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub0:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
+ %2.sub1:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub1:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
%3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
$vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
@@ -446,19 +445,3 @@ body: |
%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
...
-
----
-# Preserve V_SET_INACTIVE with exec mask already specified
-#CHECK-LABEL: name: preloaded_set_inactive
-#CHECK: V_SET_INACTIVE_B32
-name: preloaded_set_inactive
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr1, $vgpr2
-
- %0:vgpr_32 = COPY $vgpr1
- %1:vgpr_32 = COPY $vgpr2
- %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64
-...
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 47e1897..b35ef64 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[40:41]
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -170,11 +170,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[36:37]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -201,6 +200,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -352,32 +354,32 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
-; GFX9-O0-NEXT: s_mov_b32 s42, s6
+; GFX9-O0-NEXT: s_mov_b32 s40, s6
; GFX9-O0-NEXT: s_mov_b32 s34, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s43, s7
-; GFX9-O0-NEXT: s_mov_b32 s44, s43
-; GFX9-O0-NEXT: s_mov_b32 s45, s42
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s42, s41
+; GFX9-O0-NEXT: s_mov_b32 s43, s40
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s5
-; GFX9-O0-NEXT: s_mov_b32 s46, s35
+; GFX9-O0-NEXT: s_mov_b32 s44, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s37, s46
-; GFX9-O0-NEXT: s_mov_b32 s38, s45
-; GFX9-O0-NEXT: s_mov_b32 s39, s44
+; GFX9-O0-NEXT: s_mov_b32 s37, s44
+; GFX9-O0-NEXT: s_mov_b32 s38, s43
+; GFX9-O0-NEXT: s_mov_b32 s39, s42
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[40:41]
; GFX9-O0-NEXT: s_getpc_b64 s[42:43]
; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12
@@ -396,8 +398,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
; GFX9-O0-NEXT: s_mov_b32 s33, s48
@@ -417,11 +419,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
@@ -539,20 +539,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s50, s33
+; GFX9-O0-NEXT: s_mov_b32 s48, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
@@ -560,48 +560,59 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1
; GFX9-O0-NEXT: s_mov_b32 s34, s8
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
+; GFX9-O0-NEXT: s_mov_b32 s38, s6
; GFX9-O0-NEXT: s_mov_b32 s36, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
-; GFX9-O0-NEXT: s_mov_b32 s35, s41
-; GFX9-O0-NEXT: s_mov_b32 s42, s40
+; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
+; GFX9-O0-NEXT: s_mov_b32 s39, s7
+; GFX9-O0-NEXT: s_mov_b32 s35, s39
+; GFX9-O0-NEXT: s_mov_b32 s44, s38
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
; GFX9-O0-NEXT: s_mov_b32 s37, s5
-; GFX9-O0-NEXT: s_mov_b32 s43, s37
-; GFX9-O0-NEXT: s_mov_b32 s44, s36
-; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
-; GFX9-O0-NEXT: s_mov_b32 s45, s43
-; GFX9-O0-NEXT: s_mov_b32 s46, s42
-; GFX9-O0-NEXT: s_mov_b32 s47, s35
-; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2
-; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b32 s45, s37
+; GFX9-O0-NEXT: s_mov_b32 s40, s36
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s41, s45
+; GFX9-O0-NEXT: s_mov_b32 s42, s44
+; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: v_writelane_b32 v1, s40, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s41, 1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s42, 2
+; GFX9-O0-NEXT: v_writelane_b32 v1, s43, 3
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s9
; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_mov_b32 s38, s35
; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37
+; GFX9-O0-NEXT: s_mov_b32 s40, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s38
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s40
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[38:39]
; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s34, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s35, 5
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: s_mov_b32 s34, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: s_getpc_b64 s[34:35]
; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -610,24 +621,24 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1]
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0
-; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1
-; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2
-; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3
-; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4
-; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5
+; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4
+; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5
+; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0
+; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1
+; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2
+; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
@@ -645,21 +656,21 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: ; kill: killed $vgpr0
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT: s_mov_b32 s33, s50
+; GFX9-O0-NEXT: s_mov_b32 s33, s48
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
@@ -671,7 +682,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
@@ -682,25 +692,25 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: s_getpc_b64 s[34:35]
; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
@@ -735,8 +745,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_mov_b32 s36, s4
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -747,73 +759,82 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: s_mov_b32 s34, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s42, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: s_mov_b32 s44, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45
+; GFX9-O0-NEXT: s_mov_b32 s45, s35
+; GFX9-O0-NEXT: s_mov_b32 s42, s45
+; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT: s_mov_b32 s35, s44
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[40:41]
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -823,50 +844,38 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: s_mov_b32 s36, -1
-; GFX9-O3-NEXT: s_brev_b32 s37, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -874,6 +883,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
@@ -908,9 +918,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -961,110 +973,113 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23
-; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24
-; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41
; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, v39
; GFX9-O0-NEXT: s_waitcnt vmcnt(3)
-; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v38
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, v37
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35
-; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36
+; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec
; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -1104,63 +1119,100 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-O0-NEXT: s_mov_b32 s37, s39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: s_mov_b32 s36, s38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
@@ -1219,8 +1271,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -1242,32 +1296,32 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16
; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 7f0db3e..7fecab0 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -147,11 +147,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -178,6 +177,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -336,40 +338,40 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s9
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s17
-; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT: s_mov_b32 s3, s7
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s7, s9
+; GFX9-O0-NEXT: s_mov_b32 s16, s8
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s9
-; GFX9-O0-NEXT: s_mov_b32 s18, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s7
+; GFX9-O0-NEXT: s_mov_b32 s18, s6
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -398,13 +400,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -434,15 +436,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
@@ -587,41 +587,56 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s19
-; GFX9-O0-NEXT: s_mov_b32 s7, s18
-; GFX9-O0-NEXT: s_mov_b32 s15, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s9
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s15
+; GFX9-O0-NEXT: s_mov_b32 s17, s8
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b32 s15, s7
+; GFX9-O0-NEXT: s_mov_b32 s8, s3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9]
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT: s_mov_b32 s2, 32
+; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -631,11 +646,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GFX9-O0-NEXT: s_mov_b32 s9, s0
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: s_mov_b32 s0, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: s_getpc_b64 s[0:1]
; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12
@@ -650,20 +660,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -701,14 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -717,6 +726,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21]
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
@@ -724,14 +734,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b32 s13, s7
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4
@@ -763,109 +773,103 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: s_mov_b32 s4, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s8, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, s11
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: s_mov_b32 s5, s10
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_endpgm
;
; GFX9-O3-LABEL: _amdgpu_cs_main:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT: s_mov_b32 s6, -1
-; GFX9-O3-NEXT: s_brev_b32 s7, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_endpgm
%tmp17 = shl i32 %index, 5
%tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
@@ -915,15 +919,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1036,11 +1040,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1067,6 +1070,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1225,40 +1231,40 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s9
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s17
-; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT: s_mov_b32 s3, s7
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s7, s9
+; GFX9-O0-NEXT: s_mov_b32 s16, s8
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s9
-; GFX9-O0-NEXT: s_mov_b32 s18, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s7
+; GFX9-O0-NEXT: s_mov_b32 s18, s6
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1287,13 +1293,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1323,15 +1329,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
@@ -1476,41 +1480,56 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s19
-; GFX9-O0-NEXT: s_mov_b32 s7, s18
-; GFX9-O0-NEXT: s_mov_b32 s15, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s9
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s15
+; GFX9-O0-NEXT: s_mov_b32 s17, s8
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b32 s15, s7
+; GFX9-O0-NEXT: s_mov_b32 s8, s3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9]
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT: s_mov_b32 s2, 32
+; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1520,11 +1539,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GFX9-O0-NEXT: s_mov_b32 s9, s0
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: s_mov_b32 s0, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: s_getpc_b64 s[0:1]
; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -1539,20 +1553,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1590,14 +1604,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -1606,6 +1619,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21]
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
@@ -1613,14 +1627,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b32 s13, s7
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4
@@ -1652,109 +1666,103 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: s_mov_b32 s4, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s8, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, s11
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: s_mov_b32 s5, s10
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT: s_mov_b32 s6, -1
-; GFX9-O3-NEXT: s_brev_b32 s7, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_endpgm
%tmp17 = shl i32 %index, 5
%tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c4..176dfee 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT: add.s32 %r3, %r2, %r1;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
-;
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
+; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4
+; OPT-NEXT: ret void
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
store i32 %add, ptr %out
@@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
; PTX-LABEL: grid_const_struct(
; PTX: {
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4];
-;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4];
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_struct(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
-; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
-; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
-; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4
+; OPT-NEXT: ret void
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%int1 = load i32, ptr %gep1
@@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NOT: .local
-; PTX: cvta.param.{{.*}}
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: { // callseq 0, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd3;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT: } // callseq 0
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
ret void
}
define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
-; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
-; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
-; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
-; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD5]];
-; PTX: st.param.b64 [param2+0], [[RD6]];
-;
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot3[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.u64 %SPL, __local_depot3;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0;
+; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2;
+; PTX-NEXT: mov.u64 %rd3, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: mov.u64 %rd5, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: add.u64 %rd7, %SP, 0;
+; PTX-NEXT: add.u64 %rd8, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd8], %r1;
+; PTX-NEXT: { // callseq 1, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: .param .b64 param1;
+; PTX-NEXT: st.param.b64 [param1+0], %rd7;
+; PTX-NEXT: .param .b64 param2;
+; PTX-NEXT: st.param.b64 [param2+0], %rd4;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape3,
+; PTX-NEXT: (
+; PTX-NEXT: param0,
+; PTX-NEXT: param1,
+; PTX-NEXT: param2
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT: } // callseq 1
+; PTX-NEXT: ret;
; OPT-LABEL: define void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NOT: alloca %struct.s
-; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
-;
+; OPT-NEXT: ret void
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
@@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
-; PTX-NOT: .local
-; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0;
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]];
-; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_memory_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
+; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8
+; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
ret void
}
define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
-; PTX-NOT .local
-; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4;
-; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]]
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]]
-; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<8>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT: add.s64 %rd3, %rd2, 4;
+; PTX-NEXT: // begin inline asm
+; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
+; PTX-NEXT: // end inline asm
+; PTX-NEXT: st.global.u64 [%rd6], %rd1;
+; PTX-NEXT: ret;
+; PTX-NOT: .local
; OPT-LABEL: define void @grid_const_inlineasm_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0
-; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1
-; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]])
-; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]])
-; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
+; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
+; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8
+; OPT-NEXT: ret void
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0];
-; PTX: add.{{.*}}
-; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}}
-; PTX: st.param.{{.*}} [param0+0], [[RD3]]
-; PTX: call
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: add.s32 %r2, %r1, %r1;
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: { // callseq 2, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r3, [retval0+0];
+; PTX-NEXT: } // callseq 2
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_partial_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4
-; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]]
-; OPT: store i32 [[TWICE]]
-; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
-; OPT: ret void
-;
+; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
+; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
+; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
+; OPT-NEXT: ret void
%val = load i32, ptr %input
%twice = add i32 %val, %val
store i32 %twice, ptr %output
@@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}};
-; PTX: st.global.{{.*}} [{{.*}}], [[RD5]];
-; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]]
-; PTX: st.param.{{.*}} [param0+0], [[RD5]]
-; PTX: escape
+; PTX-NEXT: .reg .b32 %r<6>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: ld.u32 %r2, [%rd5+4];
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: { // callseq 3, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r4, [retval0+0];
+; PTX-NEXT: } // callseq 3
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_partial_escapemem(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0
-; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4
-; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1
-; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: store ptr [[INPUT1]]
-; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]])
-; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
+; OPT-NEXT: ret i32 [[ADD]]
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
ret i32 %add
}
-define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape(
-; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}}
-; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]];
-; PTX: $L__BB[[TARGET_LABEL]]:
-; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]];
-;
-; OPT-LABEL: define void @grid_const_phi_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+; PTX-LABEL: grid_const_phi(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0;
+; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
+; PTX-NEXT: mov.u64 %rd7, %rd5;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB8_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: add.s64 %rd8, %rd8, 4;
+; PTX-NEXT: $L__BB8_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd8];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-; OPT-NOT: load i32, ptr addrspace(101)
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
@@ -235,32 +375,53 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape2(
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4];
-; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]];
-; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]];
-; PTX: ld.u32 [[R1]], [[[RD1]]];
-; PTX: $L__BB[[LABEL]]:
-; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]]
-; OPT-LABEL: define void @grid_const_phi_escape2(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_phi_ngc(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<12>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0;
+; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7;
+; PTX-NEXT: mov.u64 %rd10, %rd6;
+; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB9_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
+; PTX-NEXT: mov.u64 %rd9, %rd8;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
+; PTX-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX-NEXT: $L__BB9_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd11];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
@@ -278,22 +439,42 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_select_escape(
-; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]]
-; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0
-; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0;
-; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]];
-; PTX: ld.u32 {{%.*}}, [[[RD4]]];
-; OPT-LABEL: define void @grid_const_select_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) {
-; OPT: [[INPUT24:%.*]] = alloca i32, align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: load i32, ptr [[INOUT]]
-; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]]
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
+; PTX-NEXT: mov.u64 %rd5, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd3];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
+; PTX-NEXT: ld.u32 %r2, [%rd9];
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_select(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
%ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(
define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0];
-; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}}
-; PTX: cvt.u32.u64 {{%.*}}, [[RD1]]
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_ptrtoint(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) {
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]]
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
+; OPT-NEXT: ret i32 [[KEEPALIVE]]
%val = load i32, ptr %input
%ptrval = ptrtoint ptr %input to i32
%keepalive = add i32 %val, %ptrval
@@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
!15 = !{i32 1}
-!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
!17 = !{i32 1}
-!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
!19 = !{i32 1}
-!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
!21 = !{i32 1}
!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f20..a414a6c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,166 +1,469 @@
-; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-%struct.ham = type { [4 x i32] }
-
-; // Verify that load with static offset into parameter is done directly.
-; CHECK-LABEL: .visible .entry static_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = icmp eq i32 %arg2, 3
- br i1 %tmp, label %bb3, label %bb6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70
+source_filename = "<stdin>"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0
-bb3: ; preds = %bb
- %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
- %tmp5 = load i32, ptr %tmp4, align 4
- store i32 %tmp5, ptr %arg, align 4
- br label %bb6
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
-bb6: ; preds = %bb3, %bb
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
+;
+entry:
+ %i = load i32, ptr %s, align 4
+ store i32 %i, ptr %out, align 4
ret void
}
-; // Verify that load with dynamic offset into parameter is also done directly.
-; CHECK-LABEL: .visible .entry dynamic_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]],
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %i = load i32, ptr %b, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep_asc(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %i = load i32, ptr addrspace(101) %asc, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep_asc0(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
+;
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+ %i = load i32, ptr %asc0, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
+; COMMON-NEXT: ret void
+;
+entry:
+ call void @_Z6escapePv(ptr noundef nonnull %s) #0
ret void
}
-; Same as above, but with a bitcast present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_gep(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
+; COMMON-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ call void @_Z6escapePv(ptr noundef nonnull %b) #0
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_store(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+entry:
+ store ptr %s, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_gep_store(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
+;
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ store ptr %b, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptrtoint(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
+;
+entry:
+ %i = ptrtoint ptr %s to i64
+ store i64 %i, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @memcpy_from_param(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; COMMON-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @memcpy_to_param(
+; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
+; COMMON-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @copy_on_store(
+; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4
+; COMMON-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %load = load i8, ptr %gep, align 4
- store i8 %load, ptr %out, align 4
+ %i = load i32, ptr %in, align 4
+ store i32 %i, ptr %s, align 4
ret void
}
-; Same as above, but with an ASC(101) present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast_asc
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1
+define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define void @test_select(
+; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; SM_60-NEXT: [[BB:.*:]]
+; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
+; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
+; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
+; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
+; SM_60-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1
+; SM_70-LABEL: define void @test_select(
+; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; SM_70-NEXT: [[BB:.*:]]
+; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4
+; SM_70-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+bb:
+ %ptrnew = select i1 %cond, ptr %input1, ptr %input2
+ %valloaded = load i32, ptr %ptrnew, align 4
+ store i32 %valloaded, ptr %out, align 4
+ ret void
+}
+
+define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; COMMON-LABEL: define void @test_select_write(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
+; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
+; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; COMMON-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %asc = addrspacecast ptr %gep to ptr addrspace(101)
- %load = load i8, ptr addrspace(101) %asc, align 4
- store i8 %load, ptr %out, align 4
- ret void
-}
-
-
-; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
-; CHECK-LABEL: .visible .entry pointer_escapes
-; CHECK: .local .align 4 .b8 __local_depot{{.*}}
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0;
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]],
-; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-
-; Function Attrs: convergent norecurse nounwind mustprogress
-define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+ %ptrnew = select i1 %cond, ptr %input1, ptr %input2
+ store i32 1, ptr %ptrnew, align 4
+ ret void
+}
+
+define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
+; SM_60-LABEL: define void @test_phi(
+; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; SM_60-NEXT: [[BB:.*:]]
+; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
+; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
+; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
+; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; SM_60: [[FIRST]]:
+; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
+; SM_60-NEXT: br label %[[MERGE:.*]]
+; SM_60: [[SECOND]]:
+; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
+; SM_60-NEXT: br label %[[MERGE]]
+; SM_60: [[MERGE]]:
+; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
+; SM_60-NEXT: ret void
+;
+; SM_70-LABEL: define void @test_phi(
+; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; SM_70-NEXT: [[BB:.*:]]
+; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; SM_70: [[FIRST]]:
+; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; SM_70-NEXT: br label %[[MERGE:.*]]
+; SM_70: [[SECOND]]:
+; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; SM_70-NEXT: br label %[[MERGE]]
+; SM_70: [[MERGE]]:
+; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; SM_70-NEXT: ret void
+;
bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
- %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
+ br i1 %cond, label %first, label %second
+
+first: ; preds = %bb
+ %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
+ br label %merge
+
+second: ; preds = %bb
+ %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
+ br label %merge
+
+merge: ; preds = %second, %first
+ %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
+ %valloaded = load i32, ptr %ptrnew, align 4
+ store i32 %valloaded, ptr %inout, align 4
ret void
}
-; Function Attrs: convergent nounwind
-declare dso_local ptr @escape(ptr) local_unnamed_addr
+define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
+; COMMON-LABEL: define void @test_phi_write(
+; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
+; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
+; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; COMMON: [[FIRST]]:
+; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
+; COMMON-NEXT: br label %[[MERGE:.*]]
+; COMMON: [[SECOND]]:
+; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
+; COMMON-NEXT: br label %[[MERGE]]
+; COMMON: [[MERGE]]:
+; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; COMMON-NEXT: ret void
+;
+bb:
+ br i1 %cond, label %first, label %second
+
+first: ; preds = %bb
+ %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
+ br label %merge
+
+second: ; preds = %bb
+ %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
+ br label %merge
+
+merge: ; preds = %second, %first
+ %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
+ store i32 1, ptr %ptrnew, align 4
+ ret void
+}
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !7}
+!llvm.module.flags = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!llvm.ident = !{!20, !21}
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{ptr @static_offset, !"kernel", i32 1}
-!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
-!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
-!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
-!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{ptr @read_only, !"kernel", i32 1}
+!5 = !{ptr @escape_ptr, !"kernel", i32 1}
+!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
+!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
+!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
+!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
+!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
+!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
+!12 = !{ptr @copy_on_store, !"kernel", i32 1}
+!13 = !{ptr @read_only_gep, !"kernel", i32 1}
+!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
+!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
+!16 = !{ptr @test_select, !"kernel", i32 1}
+!17 = !{ptr @test_phi, !"kernel", i32 1}
+!18 = !{ptr @test_phi_write, !"kernel", i32 1}
+!19 = !{ptr @test_select_write, !"kernel", i32 1}
+!20 = !{!"clang version 20.0.0git"}
+!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
index 8a83543..fd3630b 100644
--- a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
+++ b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
@@ -18,6 +18,9 @@
define i32 @foo2(i32 %a, i32 %b) comdat { ret i32 0 }
define i32 @foo3(i32 %a, i32 %b) section ".abc" { ret i32 0 }
+
+ define i32 @foo4(i32 %a, i32 %b) !section_prefix !0 { ret i32 0 }
+ !0 = !{!"function_section_prefix", !"myprefix"}
...
---
name: foo
@@ -27,23 +30,24 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -109,26 +113,27 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo2
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -223,6 +228,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo3
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -289,3 +295,89 @@ body: |
bb.3:
PseudoRET
...
+---
+name: foo4
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: foo4
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: PseudoRET
+ ;
+ ; CHECK-FS-LABEL: name: foo4
+ ; CHECK-FS: bb.0:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.1:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.2:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.3:
+ ; CHECK-FS-NEXT: PseudoRET
+ bb.0:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.1:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.2:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.3:
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index c0b14d2..5f0ba4a 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -1,5 +1,4 @@
; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s
declare float @llvm.wasm.loadf32.f16(ptr)
declare void @llvm.wasm.storef16.f32(float, ptr)
@@ -28,6 +27,13 @@ define <8 x half> @splat_v8f16(float %x) {
ret <8 x half> %v
}
+; CHECK-LABEL: const_splat_v8f16:
+; CHECK: v128.const $push0=, 20800, 0, 0, 0, 0, 0, 0, 20800
+; CHECK-NEXT: return $pop0
+define <8 x half> @const_splat_v8f16() {
+ ret <8 x half> <half 42., half 0., half 0., half 0., half 0., half 0., half 0., half 42.>
+}
+
; CHECK-LABEL: extract_lane_v8f16:
; CHECK: f16x8.extract_lane $push0=, $0, 1
; CHECK-NEXT: return $pop0
@@ -308,3 +314,24 @@ define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) {
%a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x)
ret <8 x i16> %a
}
+
+; ==============================================================================
+; Load and Store
+; ==============================================================================
+define <8 x half> @load_v8f16(ptr %p) {
+; CHECK-LABEL: load_v8f16:
+; CHECK: .functype load_v8f16 (i32) -> (v128)
+; CHECK-NEXT: v128.load $push0=, 0($0)
+; CHECK-NEXT: return $pop0
+ %v = load <8 x half>, ptr %p
+ ret <8 x half> %v
+}
+
+define void @store_v8f16(<8 x half> %v, ptr %p) {
+; CHECK-LABEL: store_v8f16:
+; CHECK: .functype store_v8f16 (v128, i32) -> ()
+; CHECK-NEXT: v128.store 0($1), $0
+; CHECK-NEXT: return
+ store <8 x half> %v , ptr %p
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
index 430b0db..d1d1b0a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -7,8 +7,8 @@ target triple = "aarch64--linux-gnu"
%pair = type { i8, i8 }
; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8
; CHECK-LABEL: entry:
; CHECK-LABEL: vector.body:
; CHECK: [[LOAD1:%.*]] = load i8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 21af9ae..dec124b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -168,8 +168,8 @@ entry:
; gaps.
;
; VF_2-LABEL: Checking a loop in 'i64_factor_8'
-; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index c7a04e3..976c6a9 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -15,10 +15,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
@@ -56,10 +56,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
@@ -97,10 +97,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
@@ -138,25 +138,25 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_2'
-; VF_2: Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 44 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_2'
-; VF_4: Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 88 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_2'
-; VF_8: Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 176 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_2'
-; VF_16: Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 352 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
@@ -179,10 +179,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f16_factor_2'
-; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
; VF_4-LABEL: Checking a loop in 'f16_factor_2'
; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
@@ -261,25 +261,25 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_2'
-; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_2'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_2'
-; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_2'
-; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
@@ -306,33 +306,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_3'
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_8-LABEL: Checking a loop in 'i8_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_16-LABEL: Checking a loop in 'i8_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -358,33 +358,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_3'
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -410,33 +410,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -462,33 +462,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_3'
-; VF_2: Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 66 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_3'
-; VF_4: Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 132 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_3'
-; VF_8: Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 264 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_3'
-; VF_16: Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 528 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -514,12 +514,12 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f16_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
; VF_4-LABEL: Checking a loop in 'f16_factor_3'
; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -573,12 +573,12 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp1, align 4
; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_3'
; VF_8: Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, ptr %tmp1, align 4
@@ -618,33 +618,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_3'
-; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 36 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_3'
-; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 72 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_3'
-; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 144 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -673,41 +673,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_8-LABEL: Checking a loop in 'i8_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_16-LABEL: Checking a loop in 'i8_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -736,41 +736,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -799,41 +799,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -862,41 +862,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_4'
-; VF_2: Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 88 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_4'
-; VF_4: Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 176 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_4'
-; VF_8: Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 352 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_4'
-; VF_16: Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 704 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -997,14 +997,14 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp6, ptr %tmp2, align 4
; VF_2-NEXT: Found an estimated cost of 20 for VF 2 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_4'
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp1, align 4
@@ -1051,41 +1051,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_4'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_4'
-; VF_4: Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_4'
-; VF_8: Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 96 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_4'
-; VF_16: Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 192 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 6d309c4..df02cb7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -176,7 +176,7 @@ declare i16 @llvm.umax.i16(i16, i16)
; Test case for https://github.com/llvm/llvm-project/issues/106780.
define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 {
; CHECK-LABEL: define i32 @cost_of_exit_branch_and_cond_insts(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[X]] to i32
; CHECK-NEXT: [[UMAX3:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP0]], i32 111)
@@ -404,6 +404,95 @@ exit:
ret void
}
+; Test for https://github.com/llvm/llvm-project/issues/108098.
+define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define void @gather_interleave_group_with_dead_insert_pos(
+; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[STEP_ADD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP11]], <8 x ptr> [[TMP13]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP12]], <8 x ptr> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1
+; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[L_1]] to i32
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 4
+; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep.src.0 = getelementptr i8, ptr %src, i64 %iv
+ %l.dead = load i8, ptr %gep.src.0, align 1
+ %iv.1 = add i64 %iv, 1
+ %gep.src.1 = getelementptr i8, ptr %src, i64 %iv.1
+ %l.1 = load i8, ptr %gep.src.1, align 1
+ %ext = zext i8 %l.1 to i32
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %ext, ptr %gep.dst, align 4
+ %iv.next = add nsw i64 %iv, 4
+ %ec = icmp slt i64 %iv, %N
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+64bit,+v" }
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -425,4 +514,6 @@ exit:
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
index 88eb9c4..13c443c 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
@@ -6,7 +6,8 @@
; Check the cost function for the <8 x i128> store interleave group.
; CHECK: LV: Checking a loop in 'fun'
-; CHECK: LV: Found an estimated cost of 8 for VF 4 For instruction: store i128 8721036757475490113
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113
define noundef i32 @fun(i32 %argc, ptr nocapture readnone %argv) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
new file mode 100644
index 0000000..3d2c2e5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+; Check that we correctly handle the use of %start2 in the exit block, and do
+; not crash.
+
+define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) {
+; CHECK-LABEL: define i64 @foo(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i64 [[START:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[START2:%.*]] = and i64 [[START]], 12345
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END]], [[START2]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START2]], [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]]
+; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START2]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IND1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IND_NEXT1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[IND_NEXT1]] = add i64 [[IND1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IND_NEXT1]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[USE:%.*]] = phi i64 [ [[START2]], %[[FOR_BODY]] ], [ [[START2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[USE]]
+;
+entry:
+ %start2 = and i64 %start, 12345
+ br label %for.body
+
+for.body:
+ %ind = phi i64 [ %start2, %entry ], [ %ind.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind
+ %0 = load i32, ptr %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind
+ %1 = load i32, ptr %arrayidx2, align 4
+ %ind.next = add i64 %ind, 1
+ %cmp = icmp ne i64 %ind.next, %end
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ %use = phi i64 [ %start2, %for.body ]
+ ret i64 %use
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
index 7959e4d..56ed92e 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
@@ -4,6 +4,7 @@
; There is currently no scenario for using the ctx profile without thinlto.
;
; RUN: opt -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s
+; RUN: opt -debug-info-for-profiling -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s
declare void @bar()
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index d28af85..a3a62f0 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -11,7 +11,23 @@
// llvm-dis [options] x.bc - Read LLVM bitcode from the x.bc file, write asm
// to the x.ll file.
// Options:
-// --help - Output information about command line switches
+//
+// Color Options:
+// --color - Use colors in output (default=autodetect)
+//
+// Disassembler Options:
+// -f - Enable binary output on terminals
+// --materialize-metadata - Load module without materializing metadata,
+// then materialize only the metadata
+// -o <filename> - Override output filename
+// --show-annotations - Add informational comments to the .ll file
+//
+// Generic Options:
+// --help - Display available options
+// (--help-hidden for more)
+// --help-list - Display list of available options
+// (--help-list-hidden for more)
+// --version - Display the version of this program
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index b76d24d..148afd9 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -1354,14 +1354,18 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
auto *BB = &*F->begin();
auto It = BB->begin();
auto *Select = cast<sandboxir::SelectInst>(&*It++);
+ const auto *ConstSelect = Select; // To test the const getters.
auto *Ret = &*It++;
// Check getCondition().
EXPECT_EQ(Select->getCondition(), Cond0);
+ EXPECT_EQ(ConstSelect->getCondition(), Cond0);
// Check getTrueValue().
EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(ConstSelect->getTrueValue(), V0);
// Check getFalseValue().
EXPECT_EQ(Select->getFalseValue(), V1);
+ EXPECT_EQ(ConstSelect->getFalseValue(), V1);
// Check setCondition().
Select->setCondition(Cond1);
EXPECT_EQ(Select->getCondition(), Cond1);
@@ -1371,6 +1375,13 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
// Check setFalseValue().
Select->setFalseValue(V0);
EXPECT_EQ(Select->getFalseValue(), V0);
+ // Check swapValues().
+ Select->swapValues();
+ EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(Select->getFalseValue(), V1);
+ // Check areInvalidOperands.
+ EXPECT_EQ(sandboxir::SelectInst::areInvalidOperands(Cond0, V0, V1), nullptr);
+ EXPECT_NE(sandboxir::SelectInst::areInvalidOperands(V0, V1, Cond0), nullptr);
{
// Check SelectInst::create() InsertBefore.
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index a1f39fe..a1a4117 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -964,6 +964,32 @@ define void @foo(i32 %cond0, i32 %cond1) {
EXPECT_EQ(Switch->findCaseDest(BB1), One);
}
+TEST_F(TrackerTest, SelectInst) {
+ parseIR(C, R"IR(
+define void @foo(i1 %c0, i8 %v0, i8 %v1) {
+ %sel = select i1 %c0, i8 %v0, i8 %v1
+ ret void
+}
+)IR");
+ llvm::Function *LLVMF = &*M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(LLVMF);
+ auto *V0 = F->getArg(1);
+ auto *V1 = F->getArg(2);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+ auto *Select = cast<sandboxir::SelectInst>(&*It++);
+
+ // Check tracking for swapValues.
+ Ctx.save();
+ Select->swapValues();
+ EXPECT_EQ(Select->getTrueValue(), V1);
+ EXPECT_EQ(Select->getFalseValue(), V0);
+ Ctx.revert();
+ EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(Select->getFalseValue(), V1);
+}
+
TEST_F(TrackerTest, ShuffleVectorInst) {
parseIR(C, R"IR(
define void @foo(<2 x i8> %v1, <2 x i8> %v2) {
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index c5849b6..ef51864 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -504,7 +504,7 @@ struct MatchableInfo {
/// TheDef - This is the definition of the instruction or InstAlias that this
/// matchable came from.
- Record *const TheDef;
+ const Record *const TheDef;
// ResInstSize - The size of the resulting instruction for this matchable.
unsigned ResInstSize;
@@ -762,7 +762,7 @@ public:
RegisterClassesTy RegisterClasses;
/// Map of Predicate records to their subtarget information.
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
+ SubtargetFeatureInfoMap SubtargetFeatures;
/// Map of AsmOperandClass records to their class information.
std::map<const Record *, ClassInfo *> AsmOperandClasses;
@@ -1338,7 +1338,7 @@ void AsmMatcherInfo::buildRegisterClasses(
// Name the register classes which correspond to a user defined RegisterClass.
for (const CodeGenRegisterClass &RC : RegClassList) {
// Def will be NULL for non-user defined register classes.
- Record *Def = RC.getDef();
+ const Record *Def = RC.getDef();
if (!Def)
continue;
ClassInfo *CI = RegisterSetClasses[RegisterSet(RC.getOrder().begin(),
@@ -1513,8 +1513,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
void AsmMatcherInfo::buildInfo() {
// Build information about all of the AssemblerPredicates.
- const std::vector<std::pair<Record *, SubtargetFeatureInfo>>
- &SubtargetFeaturePairs = SubtargetFeatureInfo::getAll(Records);
+ SubtargetFeaturesInfoVec SubtargetFeaturePairs =
+ SubtargetFeatureInfo::getAll(Records);
SubtargetFeatures.insert(SubtargetFeaturePairs.begin(),
SubtargetFeaturePairs.end());
#ifndef NDEBUG
@@ -3226,9 +3226,9 @@ static void emitMatchClassKindNames(std::forward_list<ClassInfo> &Infos,
}
static std::string
-getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
+getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) {
std::string Name = "AMFBS";
- for (const auto &Feature : FeatureBitset)
+ for (const Record *Feature : FeatureBitset)
Name += ("_" + Feature->getName()).str();
return Name;
}
@@ -3451,7 +3451,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
StringTable.EmitString(OS);
OS << ";\n\n";
- std::vector<std::vector<Record *>> FeatureBitsets;
+ std::vector<std::vector<const Record *>> FeatureBitsets;
for (const auto &MI : Info.Matchables) {
if (MI->RequiredFeatures.empty())
continue;
@@ -3460,8 +3460,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
FeatureBitsets.back().push_back(MI->RequiredFeatures[I]->TheDef);
}
- llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A,
- const std::vector<Record *> &B) {
+ llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A,
+ const std::vector<const Record *> &B) {
if (A.size() < B.size())
return true;
if (A.size() > B.size())
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index 88acd79..69ca9a8 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -403,7 +403,7 @@ void CodeEmitterGen::emitInstructionBaseValues(
<< HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n";
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo")) {
@@ -485,7 +485,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
std::set<unsigned> HwModes;
BitWidth = 0;
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo"))
continue;
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index fbf1d47..46aad7f 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -185,7 +185,7 @@ private:
// KeyInstrVec - list of key instructions.
std::vector<Record *> KeyInstrVec;
- DenseMap<Record *, std::vector<Record *>> MapTable;
+ DenseMap<const Record *, std::vector<Record *>> MapTable;
public:
MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
@@ -371,7 +371,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
// emitted as first column.
OS << "Table[][" << NumCol + 1 << "] = {\n";
for (unsigned i = 0; i < TotalNumInstr; i++) {
- Record *CurInstr = NumberedInstructions[i]->TheDef;
+ const Record *CurInstr = NumberedInstructions[i]->TheDef;
std::vector<Record *> ColInstrs = MapTable[CurInstr];
std::string OutStr;
unsigned RelExists = 0;
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 4582478..a77e247 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -1255,28 +1255,28 @@ bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const {
false);
}
Record *TreePredicateFn::getMemoryVT() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("MemoryVT"))
return nullptr;
return R->getValueAsDef("MemoryVT");
}
ListInit *TreePredicateFn::getAddressSpaces() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("AddressSpaces"))
return nullptr;
return R->getValueAsListInit("AddressSpaces");
}
int64_t TreePredicateFn::getMinAlignment() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("MinAlignment"))
return 0;
return R->getValueAsInt("MinAlignment");
}
Record *TreePredicateFn::getScalarMemoryVT() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("ScalarMemoryVT"))
return nullptr;
return R->getValueAsDef("ScalarMemoryVT");
@@ -1390,7 +1390,7 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const {
if (Tree->isLeaf())
TreeClassName = "SDNode";
else {
- Record *Op = Tree->getOperator();
+ const Record *Op = Tree->getOperator();
const SDNodeInfo &Info = PatFragRec->getDAGPatterns().getSDNodeInfo(Op);
TreeClassName = Info.getSDClassName();
}
@@ -1848,7 +1848,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const {
// TreePatternNode implementation
//
-static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
+static unsigned GetNumNodeResults(const Record *Operator,
+ CodeGenDAGPatterns &CDP) {
if (Operator->getName() == "set" || Operator->getName() == "implicit")
return 0; // All return nothing.
@@ -2077,7 +2078,7 @@ void TreePatternNode::InlinePatternFragments(
return;
}
- Record *Op = getOperator();
+ const Record *Op = getOperator();
if (!Op->isSubClassOf("PatFrags")) {
if (getNumChildren() == 0) {
@@ -2340,7 +2341,7 @@ TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const {
/// return the ComplexPattern information, otherwise return null.
const ComplexPattern *
TreePatternNode::getComplexPatternInfo(const CodeGenDAGPatterns &CGP) const {
- Record *Rec;
+ const Record *Rec;
if (isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(getLeafValue());
if (!DI)
@@ -2793,7 +2794,7 @@ bool TreePatternNode::canPatternMatch(std::string &Reason,
// TreePattern implementation
//
-TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
+TreePattern::TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput,
CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
@@ -2801,15 +2802,15 @@ TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
Trees.push_back(ParseTreePattern(I, ""));
}
-TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
+TreePattern::TreePattern(const Record *TheRec, DagInit *Pat, bool isInput,
CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
Trees.push_back(ParseTreePattern(Pat, ""));
}
-TreePattern::TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
- CodeGenDAGPatterns &cdp)
+TreePattern::TreePattern(const Record *TheRec, TreePatternNodePtr Pat,
+ bool isInput, CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
Trees.push_back(Pat);
@@ -3389,7 +3390,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
return false;
}
- Record *Rec;
+ const Record *Rec;
if (Pat->isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
if (!DI)
@@ -3408,7 +3409,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
Slot = Pat;
return true;
}
- Record *SlotRec;
+ const Record *SlotRec;
if (Slot->isLeaf()) {
SlotRec = cast<DefInit>(Slot->getLeafValue())->getDef();
} else {
@@ -3633,7 +3634,8 @@ public:
};
static bool InferFromPattern(CodeGenInstruction &InstInfo,
- const InstAnalyzer &PatInfo, Record *PatDef) {
+ const InstAnalyzer &PatInfo,
+ const Record *PatDef) {
bool Error = false;
// Remember where InstInfo got its flags.
@@ -3729,7 +3731,7 @@ static bool hasNullFragReference(ListInit *LI) {
/// Get all the instructions in a tree.
static void getInstructionsInTree(TreePatternNode &Tree,
- SmallVectorImpl<Record *> &Instrs) {
+ SmallVectorImpl<const Record *> &Instrs) {
if (Tree.isLeaf())
return;
if (Tree.getOperator()->isSubClassOf("Instruction"))
@@ -3935,8 +3937,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
// Create and insert the instruction.
// FIXME: InstImpResults should not be part of DAGInstruction.
- Record *R = I.getRecord();
- DAGInsts.try_emplace(R, std::move(Results), std::move(Operands),
+ DAGInsts.try_emplace(I.getRecord(), std::move(Results), std::move(Operands),
std::move(InstImpResults), SrcPattern, ResultPattern);
LLVM_DEBUG(I.dump());
@@ -3989,9 +3990,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
}
// If we can, convert the instructions to be patterns that are matched!
- for (auto &Entry : Instructions) {
- Record *Instr = Entry.first;
- DAGInstruction &TheInst = Entry.second;
+ for (const auto &[Instr, TheInst] : Instructions) {
TreePatternNodePtr SrcPattern = TheInst.getSrcPattern();
TreePatternNodePtr ResultPattern = TheInst.getResultPattern();
@@ -4078,7 +4077,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
for (const PatternToMatch &PTM : ptms()) {
// We can only infer from single-instruction patterns, otherwise we won't
// know which instruction should get the flags.
- SmallVector<Record *, 8> PatInstrs;
+ SmallVector<const Record *, 8> PatInstrs;
getInstructionsInTree(PTM.getDstPattern(), PatInstrs);
if (PatInstrs.size() != 1)
continue;
@@ -4135,7 +4134,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
void CodeGenDAGPatterns::VerifyInstructionFlags() {
unsigned Errors = 0;
for (const PatternToMatch &PTM : ptms()) {
- SmallVector<Record *, 8> Instrs;
+ SmallVector<const Record *, 8> Instrs;
getInstructionsInTree(PTM.getDstPattern(), Instrs);
if (Instrs.empty())
continue;
@@ -4245,7 +4244,7 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) {
}
void CodeGenDAGPatterns::ParseOnePattern(
- Record *TheDef, TreePattern &Pattern, TreePattern &Result,
+ const Record *TheDef, TreePattern &Pattern, TreePattern &Result,
const std::vector<Record *> &InstImpResults, bool ShouldIgnore) {
// Inline pattern fragments and expand multiple alternatives.
@@ -4591,7 +4590,7 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
std::vector<TreePatternNodePtr> &Children) {
assert(N->getNumChildren() == 2 &&
"Associative but doesn't have 2 children!");
- Record *Operator = N->getOperator();
+ const Record *Operator = N->getOperator();
// Only permit raw nodes.
if (!N->getName().empty() || !N->getPredicateCalls().empty() ||
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 88a5437..4dc08e6 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -634,7 +634,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
/// OperatorOrVal - The Record for the operator if this is an interior node
/// (not a leaf) or the init value (e.g. the "GPRC" record, or "7") for a
/// leaf.
- PointerUnion<Record *, Init *> OperatorOrVal;
+ PointerUnion<const Record *, Init *> OperatorOrVal;
/// Name - The name given to this node with the :$foo notation.
///
@@ -657,7 +657,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
const Record *GISelFlags = nullptr;
public:
- TreePatternNode(Record *Op, std::vector<TreePatternNodePtr> Ch,
+ TreePatternNode(const Record *Op, std::vector<TreePatternNodePtr> Ch,
unsigned NumResults)
: OperatorOrVal(Op), TransformFn(nullptr), Children(std::move(Ch)) {
Types.resize(NumResults);
@@ -717,9 +717,9 @@ public:
assert(isLeaf());
return cast<Init *>(OperatorOrVal);
}
- Record *getOperator() const {
+ const Record *getOperator() const {
assert(!isLeaf());
- return cast<Record *>(OperatorOrVal);
+ return cast<const Record *>(OperatorOrVal);
}
unsigned getNumChildren() const { return Children.size(); }
@@ -878,7 +878,7 @@ class TreePattern {
/// TheRecord - The actual TableGen record corresponding to this pattern.
///
- Record *TheRecord;
+ const Record *TheRecord;
/// Args - This is a list of all of the arguments to this pattern (for
/// PatFrag patterns), which are the 'node' markers in this pattern.
@@ -908,11 +908,11 @@ class TreePattern {
public:
/// TreePattern constructor - Parse the specified DagInits into the
/// current record.
- TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
+ TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput,
CodeGenDAGPatterns &ise);
- TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
+ TreePattern(const Record *TheRec, DagInit *Pat, bool isInput,
CodeGenDAGPatterns &ise);
- TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
+ TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput,
CodeGenDAGPatterns &ise);
/// getTrees - Return the tree patterns which corresponds to this pattern.
@@ -935,7 +935,7 @@ public:
/// getRecord - Return the actual TableGen record corresponding to this
/// pattern.
///
- Record *getRecord() const { return TheRecord; }
+ const Record *getRecord() const { return TheRecord; }
unsigned getNumArgs() const { return Args.size(); }
const std::string &getArgName(unsigned i) const {
@@ -1054,7 +1054,7 @@ public:
/// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns
/// processed to produce isel.
class PatternToMatch {
- Record *SrcRecord; // Originating Record for the pattern.
+ const Record *SrcRecord; // Originating Record for the pattern.
ListInit *Predicates; // Top level predicate conditions to match.
TreePatternNodePtr SrcPattern; // Source pattern to match.
TreePatternNodePtr DstPattern; // Resulting pattern.
@@ -1065,16 +1065,16 @@ class PatternToMatch {
unsigned ID; // Unique ID for the record.
public:
- PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src,
- TreePatternNodePtr dst, std::vector<Record *> dstregs,
- int complexity, unsigned uid, bool ignore,
- const Twine &hwmodefeatures = "")
+ PatternToMatch(const Record *srcrecord, ListInit *preds,
+ TreePatternNodePtr src, TreePatternNodePtr dst,
+ std::vector<Record *> dstregs, int complexity, unsigned uid,
+ bool ignore, const Twine &hwmodefeatures = "")
: SrcRecord(srcrecord), Predicates(preds), SrcPattern(src),
DstPattern(dst), Dstregs(std::move(dstregs)),
HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity),
GISelShouldIgnore(ignore), ID(uid) {}
- Record *getSrcRecord() const { return SrcRecord; }
+ const Record *getSrcRecord() const { return SrcRecord; }
ListInit *getPredicates() const { return Predicates; }
TreePatternNode &getSrcPattern() const { return *SrcPattern; }
TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; }
@@ -1099,14 +1099,14 @@ class CodeGenDAGPatterns {
CodeGenTarget Target;
CodeGenIntrinsicTable Intrinsics;
- std::map<Record *, SDNodeInfo, LessRecordByID> SDNodes;
- std::map<Record *, std::pair<Record *, std::string>, LessRecordByID>
+ std::map<const Record *, SDNodeInfo, LessRecordByID> SDNodes;
+ std::map<const Record *, std::pair<Record *, std::string>, LessRecordByID>
SDNodeXForms;
- std::map<Record *, ComplexPattern, LessRecordByID> ComplexPatterns;
- std::map<Record *, std::unique_ptr<TreePattern>, LessRecordByID>
+ std::map<const Record *, ComplexPattern, LessRecordByID> ComplexPatterns;
+ std::map<const Record *, std::unique_ptr<TreePattern>, LessRecordByID>
PatternFragments;
std::map<const Record *, DAGDefaultOperand, LessRecordByID> DefaultOperands;
- std::map<Record *, DAGInstruction, LessRecordByID> Instructions;
+ std::map<const Record *, DAGInstruction, LessRecordByID> Instructions;
// Specific SDNode definitions:
Record *intrinsic_void_sdnode;
@@ -1134,7 +1134,7 @@ public:
Record *getSDNodeNamed(StringRef Name) const;
- const SDNodeInfo &getSDNodeInfo(Record *R) const {
+ const SDNodeInfo &getSDNodeInfo(const Record *R) const {
auto F = SDNodes.find(R);
assert(F != SDNodes.end() && "Unknown node!");
return F->second;
@@ -1142,19 +1142,19 @@ public:
// Node transformation lookups.
typedef std::pair<Record *, std::string> NodeXForm;
- const NodeXForm &getSDNodeTransform(Record *R) const {
+ const NodeXForm &getSDNodeTransform(const Record *R) const {
auto F = SDNodeXForms.find(R);
assert(F != SDNodeXForms.end() && "Invalid transform!");
return F->second;
}
- const ComplexPattern &getComplexPattern(Record *R) const {
+ const ComplexPattern &getComplexPattern(const Record *R) const {
auto F = ComplexPatterns.find(R);
assert(F != ComplexPatterns.end() && "Unknown addressing mode!");
return F->second;
}
- const CodeGenIntrinsic &getIntrinsic(Record *R) const {
+ const CodeGenIntrinsic &getIntrinsic(const Record *R) const {
for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
if (Intrinsics[i].TheDef == R)
return Intrinsics[i];
@@ -1181,20 +1181,19 @@ public:
}
// Pattern Fragment information.
- TreePattern *getPatternFragment(Record *R) const {
+ TreePattern *getPatternFragment(const Record *R) const {
auto F = PatternFragments.find(R);
assert(F != PatternFragments.end() && "Invalid pattern fragment request!");
return F->second.get();
}
- TreePattern *getPatternFragmentIfRead(Record *R) const {
+ TreePattern *getPatternFragmentIfRead(const Record *R) const {
auto F = PatternFragments.find(R);
if (F == PatternFragments.end())
return nullptr;
return F->second.get();
}
- typedef std::map<Record *, std::unique_ptr<TreePattern>,
- LessRecordByID>::const_iterator pf_iterator;
+ using pf_iterator = decltype(PatternFragments)::const_iterator;
pf_iterator pf_begin() const { return PatternFragments.begin(); }
pf_iterator pf_end() const { return PatternFragments.end(); }
iterator_range<pf_iterator> ptfs() const { return PatternFragments; }
@@ -1206,11 +1205,11 @@ public:
iterator_range<ptm_iterator> ptms() const { return PatternsToMatch; }
/// Parse the Pattern for an instruction, and insert the result in DAGInsts.
- typedef std::map<Record *, DAGInstruction, LessRecordByID> DAGInstMap;
+ typedef std::map<const Record *, DAGInstruction, LessRecordByID> DAGInstMap;
void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern,
DAGInstMap &DAGInsts);
- const DAGInstruction &getInstruction(Record *R) const {
+ const DAGInstruction &getInstruction(const Record *R) const {
auto F = Instructions.find(R);
assert(F != Instructions.end() && "Unknown instruction!");
return F->second;
@@ -1244,7 +1243,7 @@ private:
void GenerateVariants();
void VerifyInstructionFlags();
- void ParseOnePattern(Record *TheDef, TreePattern &Pattern,
+ void ParseOnePattern(const Record *TheDef, TreePattern &Pattern,
TreePattern &Result,
const std::vector<Record *> &InstImpResults,
bool ShouldIgnore = false);
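
The CodeGenDAGPatterns hunks above lean on two idioms: maps keyed by `const Record *` keep working unchanged because LessRecordByID orders by record identity and only ever reads through the pointer, and the new `decltype(...)::const_iterator` alias tracks the map type automatically. A standalone sketch with assumed stand-in types, not LLVM code:

#include <cassert>
#include <map>
#include <string>

struct Record {
  unsigned ID;
  std::string Name;
};

// Hypothetical stand-in for LLVM's LessRecordByID: orders by unique ID, so
// const-qualifying the key pointer costs nothing.
struct LessRecordByID {
  bool operator()(const Record *L, const Record *R) const {
    return L->ID < R->ID;
  }
};

std::map<const Record *, std::string, LessRecordByID> PatternFragments;

// Mirrors `using pf_iterator = decltype(PatternFragments)::const_iterator;`:
// if the key or comparator changes again, the alias follows automatically.
using pf_iterator = decltype(PatternFragments)::const_iterator;

int main() {
  Record A{1, "add"}, B{2, "sub"};
  PatternFragments[&A] = "frag-add";
  PatternFragments[&B] = "frag-sub";
  for (pf_iterator I = PatternFragments.begin(), E = PatternFragments.end();
       I != E; ++I)
    assert(!I->second.empty());
  return 0;
}
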
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
index 1cc217b..8d698fa 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
@@ -298,7 +298,8 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
return std::pair(0U, 0U);
}
-static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
+static void ParseConstraint(StringRef CStr, CGIOperandList &Ops,
+ const Record *Rec) {
// EARLY_CLOBBER: @early $reg
StringRef::size_type wpos = CStr.find_first_of(" \t");
StringRef::size_type start = CStr.find_first_not_of(" \t");
@@ -391,7 +392,8 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint;
}
-static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
+static void ParseConstraints(StringRef CStr, CGIOperandList &Ops,
+ const Record *Rec) {
if (CStr.empty())
return;
@@ -428,7 +430,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
// CodeGenInstruction Implementation
//===----------------------------------------------------------------------===//
-CodeGenInstruction::CodeGenInstruction(Record *R)
+CodeGenInstruction::CodeGenInstruction(const Record *R)
: TheDef(R), Operands(R), InferredFrom(nullptr) {
Namespace = R->getValueAsString("Namespace");
AsmString = std::string(R->getValueAsString("AsmString"));
@@ -501,7 +503,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
HasComplexDeprecationPredicate = true;
DeprecatedReason =
std::string(R->getValueAsString("ComplexDeprecationPredicate"));
- } else if (RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) {
+ } else if (const RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) {
// Check if we have a Subtarget feature mask.
HasComplexDeprecationPredicate = false;
DeprecatedReason = Dep->getValue()->getAsString();
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index f4af0e8..3d4360f 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -222,8 +222,8 @@ public:
class CodeGenInstruction {
public:
- Record *TheDef; // The actual record defining this instruction.
- StringRef Namespace; // The namespace the instruction is in.
+ const Record *TheDef; // The actual record defining this instruction.
+ StringRef Namespace; // The namespace the instruction is in.
/// AsmString - The format string used to emit a .s file for the
/// instruction.
@@ -297,12 +297,12 @@ public:
// The record used to infer instruction flags, or NULL if no flag values
// have been inferred.
- Record *InferredFrom;
+ const Record *InferredFrom;
// The enum value assigned by CodeGenTarget::computeInstrsByEnum.
mutable unsigned EnumVal = 0;
- CodeGenInstruction(Record *R);
+ CodeGenInstruction(const Record *R);
/// HasOneImplicitDefWithKnownVT - If the instruction has at least one
/// implicit def and it has a known VT, return the VT, otherwise return
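
Storing `const Record *TheDef` is what makes the constructor change above safe to propagate: every use of the record inside CodeGenInstruction must now go through const member functions, and the compiler flags any mutation. A minimal sketch with hypothetical stand-in types, not the real classes:

#include <cassert>
#include <string>
#include <utility>

// Hypothetical stand-in for llvm::Record with one const and one non-const
// member function.
struct Record {
  std::string Name;
  const std::string &getName() const { return Name; }
  void setName(std::string N) { Name = std::move(N); } // non-const
};

class InstructionSketch {
  const Record *TheDef; // mirrors CodeGenInstruction::TheDef after the patch
public:
  explicit InstructionSketch(const Record *R) : TheDef(R) {
    // TheDef->setName("X"); // would no longer compile: setName is non-const
  }
  const std::string &name() const { return TheDef->getName(); }
};

int main() {
  Record R{"ADD32rr"};
  InstructionSketch CGI(&R);
  assert(CGI.name() == "ADD32rr");
  return 0;
}
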
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index cadc242..5b43f7d 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -34,7 +34,6 @@
#include <iterator>
#include <map>
#include <queue>
-#include <set>
#include <string>
#include <tuple>
#include <utility>
@@ -48,7 +47,7 @@ using namespace llvm;
// CodeGenSubRegIndex
//===----------------------------------------------------------------------===//
-CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum,
+CodeGenSubRegIndex::CodeGenSubRegIndex(const Record *R, unsigned Enum,
const CodeGenHwModes &CGH)
: TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
Name = std::string(R->getName());
@@ -99,7 +98,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
PrintFatalError(TheDef->getLoc(),
"CoveredBySubRegs must have two or more entries");
SmallVector<CodeGenSubRegIndex *, 8> IdxParts;
- for (Record *Part : Parts)
+ for (const Record *Part : Parts)
IdxParts.push_back(RegBank.getSubRegIdx(Part));
setConcatenationOf(IdxParts);
}
@@ -190,8 +189,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
// Add ad hoc alias links. This is a symmetric relationship between two
// registers, so build a symmetric graph by adding links in both ends.
- std::vector<Record *> Aliases = TheDef->getValueAsListOfDefs("Aliases");
- for (Record *Alias : Aliases) {
+ for (const Record *Alias : TheDef->getValueAsListOfDefs("Aliases")) {
CodeGenRegister *Reg = RegBank.getReg(Alias);
ExplicitAliases.push_back(Reg);
Reg->ExplicitAliases.push_back(this);
@@ -757,7 +755,8 @@ static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) {
M.erase(llvm::unique(M, deref<std::equal_to<>>()), M.end());
}
-CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
+CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
+ const Record *R)
: TheDef(R), Name(std::string(R->getName())),
TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) {
GeneratePressureSet = R->getValueAsBit("GeneratePressureSet");
@@ -765,7 +764,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
if (TypeList.empty())
PrintFatalError(R->getLoc(), "RegTypes list must not be empty!");
for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
- Record *Type = TypeList[i];
+ const Record *Type = TypeList[i];
if (!Type->isSubClassOf("ValueType"))
PrintFatalError(R->getLoc(),
"RegTypes list member '" + Type->getName() +
@@ -1168,9 +1167,9 @@ void CodeGenRegisterClass::buildRegUnitSet(
//===----------------------------------------------------------------------===//
CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank,
- Record *R)
+ const Record *R)
: TheDef(R), Name(std::string(R->getName())) {
- for (Record *RegClass : R->getValueAsListOfDefs("Classes"))
+ for (const Record *RegClass : R->getValueAsListOfDefs("Classes"))
Classes.push_back(RegBank.getRegClass(RegClass));
}
@@ -1178,7 +1177,7 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank,
// CodeGenRegBank
//===----------------------------------------------------------------------===//
-CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
+CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records,
const CodeGenHwModes &Modes)
: CGH(Modes) {
// Configure register Sets to understand register classes and tuples.
@@ -1189,10 +1188,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
// Read in the user-defined (named) sub-register indices.
// More indices will be synthesized later.
- std::vector<Record *> SRIs = Records.getAllDerivedDefinitions("SubRegIndex");
- llvm::sort(SRIs, LessRecord());
- for (unsigned i = 0, e = SRIs.size(); i != e; ++i)
- getSubRegIdx(SRIs[i]);
+ for (const Record *SRI : Records.getAllDerivedDefinitions("SubRegIndex"))
+ getSubRegIdx(SRI);
// Build composite maps from ComposedOf fields.
for (auto &Idx : SubRegIndices)
Idx.updateComponents(*this);
@@ -1223,7 +1220,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
getReg(Regs[i]);
// Expand tuples and number the new registers.
- for (Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) {
std::vector<const Record *> TupRegs = *Sets.expand(R);
llvm::sort(TupRegs, LessRecordRegister());
for (const Record *RC : TupRegs)
@@ -1288,7 +1285,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
NumNativeRegUnits = RegUnits.size();
// Read in register class definitions.
- std::vector<Record *> RCs = Records.getAllDerivedDefinitions("RegisterClass");
+ ArrayRef<const Record *> RCs =
+ Records.getAllDerivedDefinitions("RegisterClass");
if (RCs.empty())
PrintFatalError("No 'RegisterClass' subclasses defined!");
@@ -1311,9 +1309,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
CodeGenRegisterClass::computeSubClasses(*this);
// Read in the register category definitions.
- std::vector<Record *> RCats =
- Records.getAllDerivedDefinitions("RegisterCategory");
- for (auto *R : RCats)
+ for (const Record *R : Records.getAllDerivedDefinitions("RegisterCategory"))
RegCategories.emplace_back(*this, R);
}
@@ -1324,7 +1320,7 @@ CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name,
return &SubRegIndices.back();
}
-CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) {
+CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(const Record *Def) {
CodeGenSubRegIndex *&Idx = Def2SubRegIdx[Def];
if (Idx)
return Idx;
@@ -2450,7 +2446,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
/// return null. If the register is in multiple classes, and the classes have a
/// superset-subset relationship and the same set of types, return the
/// superclass. Otherwise return null.
-const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) {
+const CodeGenRegisterClass *
+CodeGenRegBank::getRegClassForRegister(const Record *R) {
const CodeGenRegister *Reg = getReg(R);
const CodeGenRegisterClass *FoundRC = nullptr;
for (const auto &RC : getRegClasses()) {
@@ -2490,7 +2487,7 @@ const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) {
}
const CodeGenRegisterClass *
-CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord,
+CodeGenRegBank::getMinimalPhysRegClass(const Record *RegRecord,
ValueTypeByHwMode *VT) {
const CodeGenRegister *Reg = getReg(RegRecord);
const CodeGenRegisterClass *BestRC = nullptr;
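
Several loops above collapse a copy-sort-index pattern into a single range-for over getAllDerivedDefinitions. A standalone sketch of that simplification, assuming (as the patch does) that the returned sequence is already deterministically ordered:

#include <cassert>
#include <vector>

struct Record {
  int ID;
};

// Stand-in for RecordKeeper::getAllDerivedDefinitions; assumed here to hand
// back records in a deterministic order already.
static std::vector<const Record *>
getAllDerivedDefinitions(const std::vector<Record> &Storage) {
  std::vector<const Record *> Out;
  for (const Record &R : Storage)
    Out.push_back(&R);
  return Out;
}

int main() {
  std::vector<Record> Storage{{1}, {2}, {3}};
  int LastID = 0;
  // New style: consume the sequence directly -- no temporary vector, no
  // explicit llvm::sort, no index loop.
  for (const Record *SRI : getAllDerivedDefinitions(Storage)) {
    assert(SRI->ID > LastID); // order is already deterministic
    LastID = SRI->ID;
  }
  return 0;
}
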
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index f0f53d3..2fa6cab 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -63,7 +63,7 @@ struct MaskRolPair {
/// CodeGenSubRegIndex - Represents a sub-register index.
class CodeGenSubRegIndex {
- Record *const TheDef;
+ const Record *const TheDef;
std::string Name;
std::string Namespace;
@@ -85,7 +85,7 @@ public:
// indexes are not used to create new register classes.
bool Artificial;
- CodeGenSubRegIndex(Record *R, unsigned Enum, const CodeGenHwModes &CGH);
+ CodeGenSubRegIndex(const Record *R, unsigned Enum, const CodeGenHwModes &CGH);
CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete;
@@ -320,7 +320,7 @@ class CodeGenRegisterClass {
// List of super-classes, topologically ordered to have the larger classes
// first. This is the same as sorting by EnumValue.
SmallVector<CodeGenRegisterClass *, 4> SuperClasses;
- Record *TheDef;
+ const Record *TheDef;
std::string Name;
// For a synthesized class, inherit missing properties from the nearest
@@ -368,7 +368,7 @@ public:
// Return the Record that defined this class, or NULL if the class was
// created by TableGen.
- Record *getDef() const { return TheDef; }
+ const Record *getDef() const { return TheDef; }
std::string getNamespaceQualification() const;
const std::string &getName() const { return Name; }
@@ -473,7 +473,7 @@ public:
void buildRegUnitSet(const CodeGenRegBank &RegBank,
std::vector<unsigned> &RegUnits) const;
- CodeGenRegisterClass(CodeGenRegBank &, Record *R);
+ CodeGenRegisterClass(CodeGenRegBank &, const Record *R);
CodeGenRegisterClass(CodeGenRegisterClass &) = delete;
// A key representing the parts of a register class used for forming
@@ -511,17 +511,17 @@ public:
// register falls into (GPR, vector, fixed, etc.) without having to know
// specific information about the target architecture.
class CodeGenRegisterCategory {
- Record *TheDef;
+ const Record *TheDef;
std::string Name;
std::list<CodeGenRegisterClass *> Classes;
public:
- CodeGenRegisterCategory(CodeGenRegBank &, Record *R);
+ CodeGenRegisterCategory(CodeGenRegBank &, const Record *R);
CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete;
// Return the Record that defined this class, or NULL if the class was
// created by TableGen.
- Record *getDef() const { return TheDef; }
+ const Record *getDef() const { return TheDef; }
std::string getName() const { return Name; }
std::list<CodeGenRegisterClass *> getClasses() const { return Classes; }
@@ -585,7 +585,7 @@ class CodeGenRegBank {
const CodeGenHwModes &CGH;
std::deque<CodeGenSubRegIndex> SubRegIndices;
- DenseMap<Record *, CodeGenSubRegIndex *> Def2SubRegIdx;
+ DenseMap<const Record *, CodeGenSubRegIndex *> Def2SubRegIdx;
CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace);
@@ -612,7 +612,6 @@ class CodeGenRegBank {
// Register categories.
std::list<CodeGenRegisterCategory> RegCategories;
- DenseMap<Record *, CodeGenRegisterCategory *> Def2RCat;
using RCatKeyMap =
std::map<CodeGenRegisterClass::Key, CodeGenRegisterCategory *>;
RCatKeyMap Key2RCat;
@@ -677,7 +676,7 @@ class CodeGenRegBank {
void computeRegUnitLaneMasks();
public:
- CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &);
+ CodeGenRegBank(const RecordKeeper &, const CodeGenHwModes &);
CodeGenRegBank(CodeGenRegBank &) = delete;
SetTheory &getSets() { return Sets; }
@@ -693,7 +692,7 @@ public:
// Find a SubRegIndex from its Record def or add to the list if it does
// not exist there yet.
- CodeGenSubRegIndex *getSubRegIdx(Record *);
+ CodeGenSubRegIndex *getSubRegIdx(const Record *);
// Find a SubRegIndex from its Record def.
const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const;
@@ -785,14 +784,15 @@ public:
/// class, return null. If the register is in multiple classes, and the
/// classes have a superset-subset relationship and the same set of types,
/// return the superclass. Otherwise return null.
- const CodeGenRegisterClass *getRegClassForRegister(Record *R);
+ const CodeGenRegisterClass *getRegClassForRegister(const Record *R);
// Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike
// getRegClassForRegister, this tries to find the smallest class containing
// the physical register. If \p VT is specified, it will only find classes
// with a matching type
const CodeGenRegisterClass *
- getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr);
+ getMinimalPhysRegClass(const Record *RegRecord,
+ ValueTypeByHwMode *VT = nullptr);
// Get the sum of unit weights.
unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 0a48fea..3dcfdc9 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -609,7 +609,7 @@ void CodeGenSchedModels::collectSchedRW() {
// Find all SchedReadWrites referenced by instruction defs.
RecVec SWDefs, SRDefs;
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
- Record *SchedDef = Inst->TheDef;
+ const Record *SchedDef = Inst->TheDef;
if (SchedDef->isValueUnset("SchedRW"))
continue;
RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW");
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h
index 81a5e3e..49dc472 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h
@@ -983,14 +983,14 @@ private:
/// recorded node and records the result.
class EmitNodeXFormMatcher : public Matcher {
unsigned Slot;
- Record *NodeXForm;
+ const Record *NodeXForm;
public:
- EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm)
+ EmitNodeXFormMatcher(unsigned slot, const Record *nodeXForm)
: Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {}
unsigned getSlot() const { return Slot; }
- Record *getNodeXForm() const { return NodeXForm; }
+ const Record *getNodeXForm() const { return NodeXForm; }
static bool classof(const Matcher *N) {
return N->getKind() == EmitNodeXForm;
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 139bf2d..0779b1e 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -822,7 +822,7 @@ SaveAndRestore<GISelFlags> RuleMatcher::setGISelFlags(const Record *R) {
}
Error RuleMatcher::defineComplexSubOperand(StringRef SymbolicName,
- Record *ComplexPattern,
+ const Record *ComplexPattern,
unsigned RendererID,
unsigned SubOperandID,
StringRef ParentSymbolicName) {
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 5b517b1..94f26d8 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -523,7 +523,7 @@ protected:
ArrayRef<SMLoc> SrcLoc;
- typedef std::tuple<Record *, unsigned, unsigned>
+ typedef std::tuple<const Record *, unsigned, unsigned>
DefinedComplexPatternSubOperand;
typedef StringMap<DefinedComplexPatternSubOperand>
DefinedComplexPatternSubOperandMap;
@@ -649,7 +649,8 @@ public:
void definePhysRegOperand(Record *Reg, OperandMatcher &OM);
- Error defineComplexSubOperand(StringRef SymbolicName, Record *ComplexPattern,
+ Error defineComplexSubOperand(StringRef SymbolicName,
+ const Record *ComplexPattern,
unsigned RendererID, unsigned SubOperandID,
StringRef ParentSymbolicName);
diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
index 7425e7f..e5e8225 100644
--- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
@@ -28,7 +28,8 @@ std::string llvm::getModeName(unsigned Mode) {
return (Twine('m') + Twine(Mode)).str();
}
-ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) {
+ValueTypeByHwMode::ValueTypeByHwMode(const Record *R,
+ const CodeGenHwModes &CGH) {
const HwModeSelect &MS = CGH.getHwModeSelect(R);
for (const HwModeSelect::PairType &P : MS.Items) {
auto I = Map.insert({P.first, MVT(llvm::getValueType(P.second))});
@@ -39,7 +40,8 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) {
PtrAddrSpace = R->getValueAsInt("AddrSpace");
}
-ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) {
+ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, MVT T)
+ : ValueTypeByHwMode(T) {
if (R->isSubClassOf("PtrValueType"))
PtrAddrSpace = R->getValueAsInt("AddrSpace");
}
@@ -102,7 +104,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
LLVM_DUMP_METHOD
void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; }
-ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec,
+ValueTypeByHwMode llvm::getValueTypeByHwMode(const Record *Rec,
const CodeGenHwModes &CGH) {
#ifndef NDEBUG
if (!Rec->isSubClassOf("ValueType"))
diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.h b/llvm/utils/TableGen/Common/InfoByHwMode.h
index a6edf3c..4f11e8e 100644
--- a/llvm/utils/TableGen/Common/InfoByHwMode.h
+++ b/llvm/utils/TableGen/Common/InfoByHwMode.h
@@ -152,8 +152,8 @@ protected:
};
struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
- ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH);
- ValueTypeByHwMode(Record *R, MVT T);
+ ValueTypeByHwMode(const Record *R, const CodeGenHwModes &CGH);
+ ValueTypeByHwMode(const Record *R, MVT T);
ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); }
ValueTypeByHwMode() = default;
@@ -174,7 +174,8 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
}
};
-ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH);
+ValueTypeByHwMode getValueTypeByHwMode(const Record *Rec,
+ const CodeGenHwModes &CGH);
raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T);
diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
index a4d6d8d..738ddf7 100644
--- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
+++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
@@ -20,12 +20,10 @@ LLVM_DUMP_METHOD void SubtargetFeatureInfo::dump() const {
}
#endif
-std::vector<std::pair<Record *, SubtargetFeatureInfo>>
-SubtargetFeatureInfo::getAll(RecordKeeper &Records) {
- std::vector<std::pair<Record *, SubtargetFeatureInfo>> SubtargetFeatures;
- std::vector<Record *> AllPredicates =
- Records.getAllDerivedDefinitions("Predicate");
- for (Record *Pred : AllPredicates) {
+SubtargetFeaturesInfoVec
+SubtargetFeatureInfo::getAll(const RecordKeeper &Records) {
+ SubtargetFeaturesInfoVec SubtargetFeatures;
+ for (const Record *Pred : Records.getAllDerivedDefinitions("Predicate")) {
// Ignore predicates that are not intended for the assembler.
//
// The "AssemblerMatcherPredicate" string should be promoted to an argument
diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
index fee2c02..d75a9a4 100644
--- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
+++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
@@ -19,18 +19,20 @@
namespace llvm {
struct SubtargetFeatureInfo;
using SubtargetFeatureInfoMap =
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
+ std::map<const Record *, SubtargetFeatureInfo, LessRecordByID>;
+using SubtargetFeaturesInfoVec =
+ std::vector<std::pair<const Record *, SubtargetFeatureInfo>>;
/// Helper class for storing information on a subtarget feature which
/// participates in instruction matching.
struct SubtargetFeatureInfo {
/// The predicate record for this feature.
- Record *TheDef;
+ const Record *TheDef;
/// A unique index assigned to represent this feature.
uint64_t Index;
- SubtargetFeatureInfo(Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
+ SubtargetFeatureInfo(const Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
/// The name of the enumerated constant identifying this feature.
std::string getEnumName() const {
@@ -48,8 +50,8 @@ struct SubtargetFeatureInfo {
}
void dump() const;
- static std::vector<std::pair<Record *, SubtargetFeatureInfo>>
- getAll(RecordKeeper &Records);
+
+ static SubtargetFeaturesInfoVec getAll(const RecordKeeper &Records);
/// Emit the subtarget feature flag definitions.
///
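
The SubtargetFeaturesInfoVec alias introduced above keeps the factory's return type and its callers from spelling the pair-vector type independently. A self-contained sketch with stand-in types:

#include <cstdint>
#include <utility>
#include <vector>

struct Record {
  unsigned ID;
};

struct SubtargetFeatureInfo {
  const Record *TheDef;
  uint64_t Index;
};

// One alias, used by both the declaration site and the factory below, so
// the two can never drift apart.
using SubtargetFeaturesInfoVec =
    std::vector<std::pair<const Record *, SubtargetFeatureInfo>>;

static SubtargetFeaturesInfoVec getAll(const std::vector<Record> &Preds) {
  SubtargetFeaturesInfoVec Out;
  uint64_t Idx = 0;
  for (const Record &Pred : Preds)
    Out.push_back({&Pred, SubtargetFeatureInfo{&Pred, Idx++}});
  return Out;
}

int main() {
  std::vector<Record> Preds{{1}, {2}};
  SubtargetFeaturesInfoVec All = getAll(Preds);
  return All.size() == 2 ? 0 : 1;
}
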
diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
index 049bd37..ce4cd35 100644
--- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
@@ -77,7 +77,7 @@ class VarLenCodeEmitterGen {
// name suffix to improve readability of the generated code.
std::map<AltEncodingTy, std::string> Modes;
- DenseMap<Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts;
+ DenseMap<const Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts;
// Emit based values (i.e. fixed bits in the encoded instructions)
void emitInstructionBaseValues(
@@ -227,7 +227,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) {
auto NumberedInstructions = Target.getInstructionsByEnumValue();
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
// Create the corresponding VarLenInst instance.
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo"))
@@ -249,7 +249,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) {
continue;
}
}
- RecordVal *RV = R->getValue("Inst");
+ const RecordVal *RV = R->getValue("Inst");
DagInit *DI = cast<DagInit>(RV->getValue());
VarLenInsts[R].insert({Universal, VarLenInst(DI, RV)});
}
@@ -356,7 +356,7 @@ void VarLenCodeEmitterGen::emitInstructionBaseValues(
unsigned NumFixedValueWords = 0U;
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo")) {
diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp
index b43a8e6..6c72103 100644
--- a/llvm/utils/TableGen/DAGISelEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -47,7 +47,7 @@ static unsigned getResultPatternCost(TreePatternNode &P,
return 0;
unsigned Cost = 0;
- Record *Op = P.getOperator();
+ const Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost++;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -67,7 +67,7 @@ static unsigned getResultPatternSize(TreePatternNode &P,
return 0;
unsigned Cost = 0;
- Record *Op = P.getOperator();
+ const Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost += Op->getValueAsInt("CodeSize");
}
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index a14cc3d..96a40f0 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -64,8 +64,8 @@ class MatcherTableEmitter {
std::vector<const ComplexPattern *> ComplexPatterns;
- DenseMap<Record *, unsigned> NodeXFormMap;
- std::vector<Record *> NodeXForms;
+ DenseMap<const Record *, unsigned> NodeXFormMap;
+ std::vector<const Record *> NodeXForms;
std::vector<std::string> VecIncludeStrings;
MapVector<std::string, unsigned, StringMap<unsigned>> VecPatterns;
@@ -203,7 +203,7 @@ private:
return llvm::find(ComplexPatterns, &P) - ComplexPatterns.begin();
}
- unsigned getNodeXFormID(Record *Rec) {
+ unsigned getNodeXFormID(const Record *Rec) {
unsigned &Entry = NodeXFormMap[Rec];
if (Entry == 0) {
NodeXForms.push_back(Rec);
@@ -930,7 +930,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
GetPatFromTreePatternNode(SNT->getPattern().getSrcPattern());
std::string dst =
GetPatFromTreePatternNode(SNT->getPattern().getDstPattern());
- Record *PatRecord = SNT->getPattern().getSrcRecord();
+ const Record *PatRecord = SNT->getPattern().getSrcRecord();
std::string include_src = getIncludePath(PatRecord);
unsigned Offset =
getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
@@ -1043,7 +1043,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
GetPatFromTreePatternNode(CM->getPattern().getSrcPattern());
std::string dst =
GetPatFromTreePatternNode(CM->getPattern().getDstPattern());
- Record *PatRecord = CM->getPattern().getSrcRecord();
+ const Record *PatRecord = CM->getPattern().getSrcRecord();
std::string include_src = getIncludePath(PatRecord);
unsigned Offset =
getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
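
getNodeXFormID, touched above, relies on a common interning idiom: a value-initialized map entry starts at 0, which doubles as the "unassigned" marker, so IDs are stored off by one. A standalone sketch with std::map standing in for llvm::DenseMap; the function body past the hunk shown is reconstructed here by assumption:

#include <cassert>
#include <map>
#include <vector>

struct Record {};

std::map<const Record *, unsigned> NodeXFormMap;
std::vector<const Record *> NodeXForms;

unsigned getNodeXFormID(const Record *Rec) {
  unsigned &Entry = NodeXFormMap[Rec]; // first access value-initializes to 0
  if (Entry == 0) {
    NodeXForms.push_back(Rec);
    Entry = NodeXForms.size(); // store index + 1 so 0 stays "unassigned"
  }
  return Entry - 1;
}

int main() {
  Record A, B;
  assert(getNodeXFormID(&A) == 0);
  assert(getNodeXFormID(&B) == 1);
  assert(getNodeXFormID(&A) == 0); // stable on repeated lookups
  return 0;
}
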
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 4e65690..bb8f4dc 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -746,7 +746,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N,
static bool mayInstNodeLoadOrStore(const TreePatternNode &N,
const CodeGenDAGPatterns &CGP) {
- Record *Op = N.getOperator();
+ const Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
return II.mayLoad || II.mayStore;
@@ -757,7 +757,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N,
if (N.isLeaf())
return 0;
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
if (!OpRec->isSubClassOf("Instruction"))
return 0;
@@ -773,7 +773,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N,
void MatcherGen::EmitResultInstructionAsOperand(
const TreePatternNode &N, SmallVectorImpl<unsigned> &OutputOps) {
- Record *Op = N.getOperator();
+ const Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
const DAGInstruction &Inst = CGP.getInstruction(Op);
@@ -1010,7 +1010,7 @@ void MatcherGen::EmitResultOperand(const TreePatternNode &N,
if (N.isLeaf())
return EmitResultLeafAsOperand(N, ResultOps);
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
if (OpRec->isSubClassOf("Instruction"))
return EmitResultInstructionAsOperand(N, ResultOps);
if (OpRec->isSubClassOf("SDNodeXForm"))
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index 2ef98b3..01df873 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -236,7 +236,7 @@ struct OperandsSignature {
// not needed and just bloat the fast instruction selector. For
// example, X86 doesn't need to generate code to match ADD16ri8 since
// ADD16ri will do just fine.
- Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
+ const Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
if (Rec->getValueAsBit("FastIselShouldIgnore"))
return false;
@@ -417,7 +417,7 @@ private:
};
} // End anonymous namespace
-static std::string getOpcodeName(Record *Op, CodeGenDAGPatterns &CGP) {
+static std::string getOpcodeName(const Record *Op, CodeGenDAGPatterns &CGP) {
return std::string(CGP.getSDNodeInfo(Op).getEnumName());
}
@@ -461,7 +461,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
TreePatternNode &Dst = Pattern.getDstPattern();
if (Dst.isLeaf())
continue;
- Record *Op = Dst.getOperator();
+ const Record *Op = Dst.getOperator();
if (!Op->isSubClassOf("Instruction"))
continue;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -524,7 +524,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
if (InstPatNode.getNumTypes() > 1)
continue;
- Record *InstPatOp = InstPatNode.getOperator();
+ const Record *InstPatOp = InstPatNode.getOperator();
std::string OpcodeName = getOpcodeName(InstPatOp, CGP);
MVT::SimpleValueType RetVT = MVT::isVoid;
if (InstPatNode.getNumTypes())
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index b2f4d32..d82f1c3 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -120,7 +120,7 @@ static std::string explainPredicates(const TreePatternNode &N) {
if (P.isTruncStore())
Explanation += " truncstore";
- if (Record *VT = P.getMemoryVT())
+ if (const Record *VT = P.getMemoryVT())
Explanation += (" MemVT=" + VT->getName()).str();
if (Record *VT = P.getScalarMemoryVT())
Explanation += (" ScalarVT(MemVT)=" + VT->getName()).str();
@@ -168,7 +168,7 @@ static std::string explainPredicates(const TreePatternNode &N) {
return Explanation;
}
-std::string explainOperator(Record *Operator) {
+std::string explainOperator(const Record *Operator) {
if (Operator->isSubClassOf("SDNode"))
return (" (" + Operator->getValueAsString("Opcode") + ")").str();
@@ -346,7 +346,7 @@ private:
/// SDNodes to the GINodeEquiv mapping. We need to map to the GINodeEquiv to
/// check for attributes on the relation such as CheckMMOIsNonAtomic.
/// This is defined using 'GINodeEquiv' in the target description.
- DenseMap<Record *, Record *> NodeEquivs;
+ DenseMap<const Record *, Record *> NodeEquivs;
/// Keep track of the equivalence between ComplexPattern's and
/// GIComplexOperandMatcher. Map entries are specified by subclassing
@@ -379,7 +379,7 @@ private:
void gatherTypeIDValues();
void gatherNodeEquivs();
- Record *findNodeEquiv(Record *N) const;
+ Record *findNodeEquiv(const Record *N) const;
const CodeGenInstruction *getEquivNode(Record &Equiv,
const TreePatternNode &N) const;
@@ -388,7 +388,7 @@ private:
createAndImportSelDAGMatcher(RuleMatcher &Rule,
InstructionMatcher &InsnMatcher,
const TreePatternNode &Src, unsigned &TempOpIdx);
- Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R,
+ Error importComplexPatternOperandMatcher(OperandMatcher &OM, const Record *R,
unsigned &TempOpIdx) const;
Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
const TreePatternNode &SrcChild,
@@ -504,7 +504,7 @@ void GlobalISelEmitter::gatherNodeEquivs() {
}
}
-Record *GlobalISelEmitter::findNodeEquiv(Record *N) const {
+Record *GlobalISelEmitter::findNodeEquiv(const Record *N) const {
return NodeEquivs.lookup(N);
}
@@ -928,7 +928,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
}
Error GlobalISelEmitter::importComplexPatternOperandMatcher(
- OperandMatcher &OM, Record *R, unsigned &TempOpIdx) const {
+ OperandMatcher &OM, const Record *R, unsigned &TempOpIdx) const {
const auto &ComplexPattern = ComplexPatternEquivs.find(R);
if (ComplexPattern == ComplexPatternEquivs.end())
return failedImport("SelectionDAG ComplexPattern (" + R->getName() +
@@ -1508,7 +1508,7 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer(
action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) {
- Record *DstOp = Dst.getOperator();
+ const Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction")) {
if (DstOp->isSubClassOf("ValueType"))
return failedImport(
@@ -1813,7 +1813,7 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) {
// just take the first one).
if (N.getNumTypes() < 1)
return std::nullopt;
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
// We only want instructions.
if (!OpRec->isSubClassOf("Instruction"))
@@ -2011,7 +2011,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
}
// Start with the defined operands (i.e., the results of the root operator).
- Record *DstOp = Dst.getOperator();
+ const Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction"))
return failedImport("Pattern operator isn't an instruction");
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp
index f948540..f53428e 100644
--- a/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -73,7 +73,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
OS << "\n";
for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) {
- Record *Inst = II->TheDef;
+ const Record *Inst = II->TheDef;
// Don't print the target-independent instructions.
if (II->Namespace == "TargetOpcode")
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index 5fd5914..4e2138d 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -721,9 +721,9 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
}
static std::string
-getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
+getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) {
std::string Name = "CEFBS";
- for (const auto &Feature : FeatureBitset)
+ for (const Record *Feature : FeatureBitset)
Name += ("_" + Feature->getName()).str();
return Name;
}
@@ -731,7 +731,7 @@ getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
const CodeGenTarget &Target) {
const auto &All = SubtargetFeatureInfo::getAll(Records);
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
+ SubtargetFeatureInfoMap SubtargetFeatures;
SubtargetFeatures.insert(All.begin(), All.end());
OS << "#if (defined(ENABLE_INSTR_PREDICATE_VERIFIER) && !defined(NDEBUG)) "
@@ -752,18 +752,19 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures(
Target.getName(), "", "computeAvailableFeatures", SubtargetFeatures, OS);
- std::vector<std::vector<Record *>> FeatureBitsets;
+ std::vector<std::vector<const Record *>> FeatureBitsets;
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
FeatureBitsets.emplace_back();
- for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
const auto &I = SubtargetFeatures.find(Predicate);
if (I != SubtargetFeatures.end())
FeatureBitsets.back().push_back(I->second.TheDef);
}
}
- llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A,
- const std::vector<Record *> &B) {
+ llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A,
+ const std::vector<const Record *> &B) {
if (A.size() < B.size())
return true;
if (A.size() > B.size())
@@ -806,7 +807,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
OS << " CEFBS";
unsigned NumPredicates = 0;
- for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
const auto &I = SubtargetFeatures.find(Predicate);
if (I != SubtargetFeatures.end()) {
OS << '_' << I->second.TheDef->getName();
@@ -890,7 +892,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
StringRef TargetName,
bool ExpandDefinition) {
- RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
+ ArrayRef<const Record *> TIIPredicates =
+ Records.getAllDerivedDefinitions("TIIPredicate");
if (TIIPredicates.empty())
return;
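
getNameForFeatureBitset above moves from `const std::vector<Record *> &` to ArrayRef, so callers can pass any contiguous sequence without materializing a particular container. A sketch of the same idea using C++20 std::span as a stand-in for llvm::ArrayRef:

#include <cassert>
#include <span>
#include <string>
#include <vector>

struct Record {
  std::string Name;
  const std::string &getName() const { return Name; }
};

// A view parameter accepts vectors, arrays, and SmallVector-like storage
// alike without dictating the caller's container type.
static std::string
getNameForFeatureBitset(std::span<const Record *const> FeatureBitset) {
  std::string Name = "CEFBS";
  for (const Record *Feature : FeatureBitset)
    Name += "_" + Feature->getName();
  return Name;
}

int main() {
  Record A{"HasSSE2"}, B{"Is64Bit"};
  std::vector<const Record *> Bits{&A, &B};
  assert(getNameForFeatureBitset(Bits) == "CEFBS_HasSSE2_Is64Bit");
  return 0;
}
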
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index 0fc930b..8952c8e 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -446,7 +446,7 @@ void X86FoldTablesEmitter::addEntryWithFlags(FoldTable &Table,
assert((IsManual || Table.find(RegInst) == Table.end()) &&
"Override entry unexpectedly");
X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst);
- Record *RegRec = RegInst->TheDef;
+ const Record *RegRec = RegInst->TheDef;
Result.NoReverse = S & TB_NO_REVERSE;
Result.NoForward = S & TB_NO_FORWARD;
Result.FoldLoad = S & TB_FOLDED_LOAD;
@@ -537,8 +537,8 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInst,
uint16_t S, bool IsManual,
bool IsBroadcast) {
- Record *RegRec = RegInst->TheDef;
- Record *MemRec = MemInst->TheDef;
+ const Record *RegRec = RegInst->TheDef;
+ const Record *MemRec = MemInst->TheDef;
unsigned MemOutSize = MemRec->getValueAsDag("OutOperandList")->getNumArgs();
unsigned RegOutSize = RegRec->getValueAsDag("OutOperandList")->getNumArgs();
unsigned MemInSize = MemRec->getValueAsDag("InOperandList")->getNumArgs();
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
index fe8a3f5..acf9e7a 100644
--- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -35,6 +35,7 @@ static_library("Target") {
sources = [
"ABI.cpp",
"AssertFrameRecognizer.cpp",
+ "CoreFileMemoryRanges.cpp",
"DynamicRegisterInfo.cpp",
"ExecutionContext.cpp",
"InstrumentationRuntime.cpp",
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 2fda091..aceb9d0 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -152,7 +152,7 @@ public:
/// This function adds a DENY entry.
void denyDialect(StringRef dialectNamespace) {
Entry::FilterFn filterFn = [=](Operation *op) {
- return op->getDialect()->getNamespace() == dialectNamespace;
+ return op->getName().getDialectNamespace() == dialectNamespace;
};
entries.push_back(Entry{filterFn, Entry::FilterType::DENY});
}
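
The denyDialect change above swaps `op->getDialect()->getNamespace()` for `op->getName().getDialectNamespace()`: the latter reads the namespace out of the operation name itself and so, by assumption here, avoids dereferencing a dialect pointer that can be null for unregistered dialects. A standalone sketch of that failure mode, with hypothetical structs rather than the MLIR classes:

#include <cassert>
#include <string>
#include <string_view>

struct OperationName {
  std::string Full; // e.g. "tosa.transpose"
  std::string_view getDialectNamespace() const {
    std::string_view V(Full);
    return V.substr(0, V.find('.'));
  }
};

struct Dialect {
  std::string Namespace;
};

struct Operation {
  OperationName Name;
  Dialect *Dia = nullptr; // may be null when the dialect is not registered
  const OperationName &getName() const { return Name; }
  Dialect *getDialect() const { return Dia; }
};

int main() {
  Operation Op{{"tosa.transpose"}, nullptr};
  // The old form would dereference null here: Op.getDialect()->Namespace.
  assert(Op.getName().getDialectNamespace() == "tosa");
  return 0;
}
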
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
index 12b4fc4..1ee105f 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
@@ -3,8 +3,8 @@ add_mlir_doc(TosaOps TosaOps Dialects/ -gen-op-doc)
add_mlir_interface(TosaInterfaces)
set(LLVM_TARGET_DEFINITIONS TosaOps.td)
-mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls)
-mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs)
+mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=tosa)
+mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=tosa)
add_public_tablegen_target(MLIRTosaAttributesIncGen)
set(LLVM_TARGET_DEFINITIONS TosaDialectBytecode.td)
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index ab6daa3..63572f2 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -73,7 +73,6 @@ def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
-
Tosa_IntArrayAttr2:$kernel,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr4:$pad,
@@ -102,9 +101,8 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr2:$dilation,
@@ -132,9 +130,8 @@ def Tosa_Conv3DOp : Tosa_InferShapedTypeOp<"conv3d"> {
let arguments = (ins
Tosa_Tensor5D:$input,
- TensorRankOf<[Tosa_Weight], [5]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [5]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr6:$pad,
Tosa_IntArrayAttr3:$stride,
Tosa_IntArrayAttr3:$dilation,
@@ -163,9 +160,8 @@ def Tosa_DepthwiseConv2DOp : Tosa_InferShapedTypeOp<"depthwise_conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr2:$dilation,
@@ -232,7 +228,7 @@ def Tosa_FullyConnectedOp : Tosa_InferShapedTypeOp<"fully_connected"> {
let arguments = (ins
Tosa_Tensor2D:$input,
- 2DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [2]>:$weight,
Tosa_Tensor1D:$bias,
OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info
);
@@ -347,9 +343,8 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$filter,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$filter,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$out_pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttrUpto4:$out_shape,
@@ -641,12 +636,12 @@ def Tosa_LogicalAndOp : Tosa_ElementwiseOp<"logical_and", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -708,12 +703,12 @@ def Tosa_LogicalOrOp : Tosa_ElementwiseOp<"logical_or", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -731,12 +726,12 @@ def Tosa_LogicalXorOp : Tosa_ElementwiseOp<"logical_xor", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -1085,11 +1080,11 @@ def Tosa_LogicalNotOp : Tosa_ElementwiseOp<"logical_not",
}];
let arguments = (ins
- I1Tensor:$input1
+ Tosa_I1Tensor:$input1
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
}
@@ -1208,7 +1203,7 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> {
}];
let arguments = (ins
- I1Tensor:$pred,
+ Tosa_I1Tensor:$pred,
Tosa_Tensor:$on_true,
Tosa_Tensor:$on_false
);
@@ -1249,7 +1244,7 @@ def Tosa_EqualOp : Tosa_ElementwiseOp<"equal", [
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let extraClassDeclaration = [{
@@ -1277,7 +1272,7 @@ def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType]> {
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let hasFolder = 1;
@@ -1300,7 +1295,7 @@ def Tosa_GreaterEqualOp : Tosa_ElementwiseOp<"greater_equal",
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let hasFolder = 1;
@@ -1721,7 +1716,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
let arguments = (ins
Tosa_Tensor:$input1,
- Tosa_Int32Or64Tensor:$perms
+ Tosa_Int32Tensor:$perms
);
let results = (
@@ -1729,7 +1724,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
);
let extraClassDeclaration = [{
- LogicalResult getConstantPerms(llvm::SmallVector<int64_t> &perms);
+ LogicalResult getConstantPerms(llvm::SmallVector<int32_t> &perms);
}];
let hasCanonicalizer = 1;
@@ -1755,7 +1750,7 @@ def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> {
let arguments = (ins
Tosa_Tensor3D:$values,
- 2DTensorOf<[Tosa_Int32]>:$indices
+ TosaTensorRankOf<[Tosa_Int32], [2]>:$indices
);
let results = (outs
@@ -1776,7 +1771,7 @@ def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> {
let arguments = (ins
Tosa_Tensor3D:$values_in,
- 2DTensorOf<[Tosa_Int32]>:$indices,
+ TosaTensorRankOf<[Tosa_Int32], [2]>:$indices,
Tosa_Tensor3D:$input
);
@@ -1947,10 +1942,11 @@ def Tosa_ConstOp : Tosa_Op<"const", [ConstantLike, Pure,
);
let results = (outs
- TensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output
+ TosaTensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output
);
let hasFolder = 1;
+ let hasVerifier = 1;
}
//===----------------------------------------------------------------------===//
@@ -2054,7 +2050,7 @@ def Tosa_IfOp : Tosa_Op<"cond_if",
}];
let arguments = (ins
- I1Tensor:$cond,
+ Tosa_I1Tensor:$cond,
Variadic<Tosa_Tensor>:$inputs
);
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 14fc9c7..c3a0128 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -83,57 +83,82 @@ def Tosa_Weight : AnyTypeOf<[Tosa_Int4, Tosa_Int8,
Tosa_QuantizedInt, AnyFloat]>;
//===----------------------------------------------------------------------===//
+// TOSA Tensor Conformance
+//===----------------------------------------------------------------------===//
+
+def HasNo0Dimensions : And<[
+ IsRankedTensorTypePred,
+ CPred<"::llvm::all_of(::llvm::cast<::mlir::RankedTensorType>($_self).getShape(), [](auto v) { return v != 0; })">]>;
+
+class TosaTensorOf<
+ list<Type> allowedTypes, string summary = "tosa-conformant tensor">
+ : TensorOf<allowedTypes, [Or<[HasNo0Dimensions, IsUnrankedTensorTypePred]>], summary>;
+
+class TosaRankedTensorOf<
+ list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant ranked tensor">
+ : RankedTensorOf<allowedTypes, !listconcat([HasNo0Dimensions], preds), summary>;
+
+class TosaUnrankedTensorOf<list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant unranked tensor">
+ : UnrankedTensorOf<allowedTypes, preds, summary>;
+
+class TosaTensorRankOf<list<Type> allowedTypes, list<int> ranks>
+ : TosaRankedTensorOf<allowedTypes,
+ [HasAnyRankOfPred<ranks>],
+ !interleave(!foreach(rank, ranks, rank # "D"), "/") # " tensor">;
+
+//===----------------------------------------------------------------------===//
// Tensor types
//===----------------------------------------------------------------------===//
-def Tosa_Int32Tensor : TensorOf<[Tosa_Int32]>;
-def Tosa_Int32Or64Tensor : TensorOf<[Tosa_Int32Or64]>;
+def Tosa_I1Tensor : TosaTensorOf<[I1]>;
+def Tosa_Int32Tensor : TosaTensorOf<[Tosa_Int32]>;
+def Tosa_Int32Or64Tensor : TosaTensorOf<[Tosa_Int32Or64]>;
-def Tosa_FloatTensor : TensorOf<[AnyFloat]>;
+def Tosa_FloatTensor : TosaTensorOf<[AnyFloat]>;
// Either ranked or unranked tensor of TOSA supported element types.
-def Tosa_Tensor : TensorOf<[Tosa_AnyNumber]>;
+def Tosa_Tensor : TosaTensorOf<[Tosa_AnyNumber]>;
// Must be ranked but no further constraints
-def Tosa_RankedTensor : RankedTensorOf<[Tosa_AnyNumber]>;
+def Tosa_RankedTensor : TosaRankedTensorOf<[Tosa_AnyNumber]>;
// Any tensor element type allowed in Tosa ops.
def Tosa_ElementType : Type<Or<[Tosa_Int.predicate, Tosa_QuantizedInt.predicate,
AnyFloat.predicate]>, "tosa.dtype">;
class Tosa_TensorOfOrNone<list<Type> allowedTypes, string description = ""> :
- AnyTypeOf<[TensorOf<allowedTypes>, NoneType], description>;
+ AnyTypeOf<[TosaTensorOf<allowedTypes>, NoneType], description>;
//===----------------------------------------------------------------------===//
// Tensor types with constrained ranks.
//===----------------------------------------------------------------------===//
// Rank-0 (scalar) tensor
-def Tosa_ScalarTensor : TensorRankOf<[Tosa_AnyNumber], [0]>;
+def Tosa_ScalarTensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>;
// We include unranked tensors as a supported type for all possible TOSA
// tensors, since unranked does not necessarily mean invalid. Any unranked
// tensors should be shape-propagated with TOSA's shape inference pass and
// verified to leave no remaining unranked tensors.
-def Tosa_UnrankedTensor : UnrankedTensorOf<[Tosa_AnyNumber]>;
+def Tosa_UnrankedTensor : TosaUnrankedTensorOf<[Tosa_AnyNumber]>;
-def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>], "1-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>], "2-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>], "3-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>], "4-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tensor", "::mlir::TensorType">;
+def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1]>], "1-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [2]>], "2-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [3]>], "3-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [4]>], "4-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tosa-conformant tensor", "::mlir::TensorType">;
// Ranked tensors up to given rank.
def Tosa_Tensor1Dto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>;
def Tosa_Tensor1Dto6D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>;
def Tosa_TensorUpto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>;
def Tosa_Int32TensorUpto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>;
//===----------------------------------------------------------------------===//
// Generic scalar, vector, or tensor of a particular type.
@@ -142,7 +167,7 @@ def Tosa_Int32TensorUpto4D : AnyTypeOf<[
class Tosa_TypeLike<list<Type> types, string description = ""> : TypeConstraint<Or<[
AnyTypeOf<types>.predicate,
VectorOf<types>.predicate,
- TensorOf<types>.predicate]>,
+ TosaTensorOf<types>.predicate]>,
description>;
def Tosa_IntLike : Tosa_TypeLike<[Tosa_Int], "signless-integer-like">;
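
The HasNo0Dimensions predicate above is the core of the new tosa-conformance constraints: ranked tensors may not have a 0-extent dimension (MLIR's dynamic-dimension sentinel is nonzero, so dynamic dims pass). A plain-C++ sketch of the check, with a bare shape vector standing in for mlir::RankedTensorType:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Plain-C++ rendering of the CPred above: every static dimension must be
// non-zero for the tensor to be tosa-conformant.
static bool hasNo0Dimensions(const std::vector<int64_t> &shape) {
  return std::all_of(shape.begin(), shape.end(),
                     [](int64_t v) { return v != 0; });
}

int main() {
  assert(hasNo0Dimensions({1, 8, 8, 3}));
  assert(!hasNo0Dimensions({4, 0, 2})); // 0-extent dim: not tosa-conformant
  assert(hasNo0Dimensions({}));         // rank-0 (scalar) tensor is fine
  return 0;
}
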
diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
index ef40b34..90fea1f 100644
--- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
+++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
@@ -216,6 +216,19 @@ TosaOp CreateOpAndInferShape(PatternRewriter &rewriter, Location loc,
return CreateOpAndInferShape<TosaOp>(builder, resultTy, args...);
}
+// Apply an int32_t permutation to an input that has the same size as perms.
+// Perms should contain a permutation of the values 0 .. perms.size() - 1.
+template <typename T>
+SmallVector<T> applyTOSAPermutation(ArrayRef<T> input,
+ ArrayRef<int32_t> perms) {
+ SmallVector<T> permuted;
+ size_t N = input.size();
+ permuted.resize_for_overwrite(N);
+ for (size_t i = 0; i < N; i++)
+ permuted[i] = input[perms[i]];
+ return permuted;
+}
+
} // namespace tosa
} // namespace mlir
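
A usage sketch for the applyTOSAPermutation template added above, with std::vector standing in for SmallVector/ArrayRef; the perms here are a hypothetical NHWC-to-HWCN reshuffle of a shape vector:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Same loop as the header above: permuted[i] takes input[perms[i]].
template <typename T>
std::vector<T> applyTOSAPermutation(const std::vector<T> &input,
                                    const std::vector<int32_t> &perms) {
  std::vector<T> permuted(input.size());
  for (std::size_t i = 0; i < input.size(); i++)
    permuted[i] = input[perms[i]];
  return permuted;
}

int main() {
  std::vector<int64_t> shape{1, 8, 8, 3};  // N, H, W, C
  std::vector<int32_t> perms{1, 2, 3, 0};  // NHWC -> HWCN
  std::vector<int64_t> out = applyTOSAPermutation(shape, perms);
  assert((out == std::vector<int64_t>{8, 8, 3, 1}));
  return 0;
}
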
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 93e8b08..2992671 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -329,10 +329,9 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
- target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
- LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
- LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
- LLVM::SqrtOp>();
+ target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
+ LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
+ LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index 77c3d2e..fe53b49 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -313,7 +313,7 @@ public:
// convolution operation.
// TODO(suderman): See if this can be efficiently folded - check whether
// the input is used anywhere else, if not fold the constant.
- SmallVector<int64_t> weightPerm;
+ SmallVector<int32_t> weightPerm;
for (int i = 1; i < resultTy.getRank(); i++)
weightPerm.push_back(i);
weightPerm.push_back(0);
@@ -321,7 +321,7 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm);
+ auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
Value weightPermValue =
rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
Type newWeightTy =
@@ -337,7 +337,7 @@ public:
if (5 == inputTy.getRank()) {
// TODO(suderman): See if this can be efficiently folded - check whether
// the input is used anywhere else, if not fold the constant.
- SmallVector<int64_t> weightPerm;
+ SmallVector<int32_t> weightPerm;
for (int i = 1; i < resultTy.getRank(); i++)
weightPerm.push_back(i);
weightPerm.push_back(0);
@@ -345,7 +345,7 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm);
+ auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
Value weightPermValue =
rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
Type newWeightTy =
@@ -1040,22 +1040,25 @@ public:
LogicalResult matchAndRewrite(tosa::TransposeOp op,
PatternRewriter &rewriter) const final {
- SmallVector<int64_t> constantPerms;
+ SmallVector<int32_t> constantPerms;
if (failed(op.getConstantPerms(constantPerms)))
return failure();
Location loc = op.getLoc();
- // The verifier should have made sure we have a valid permutation tensor.
- assert(isPermutationVector(constantPerms) && "Expected valid permutation");
+    // The verifier should have already ensured we have a valid TOSA
+    // permutation tensor; isPermutationVector alone does not check the
+    // constraints TOSA perms must satisfy, so it is not re-asserted here.
SmallVector<OpFoldResult> inputSizes =
tensor::getMixedSizes(rewriter, loc, op.getInput1());
auto permutedSizes =
- applyPermutation<OpFoldResult>(inputSizes, constantPerms);
+ applyTOSAPermutation<OpFoldResult>(inputSizes, constantPerms);
auto permutedInit = rewriter.create<tensor::EmptyOp>(
loc, permutedSizes, op.getInput1().getType().getElementType());
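+    // linalg.transpose expects an int64_t permutation, so the verified i32
+    // TOSA perms are widened when building the replacement op.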
rewriter.replaceOpWithNewOp<linalg::TransposeOp>(
- op, op.getInput1(), permutedInit, constantPerms);
+ op, op.getInput1(), permutedInit,
+ llvm::to_vector(llvm::map_range(
+ constantPerms, [](int32_t v) -> int64_t { return v; })));
return success();
}
};
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index da9a93f..03876a7 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -88,7 +88,7 @@ struct ConsolidateTransposeOptimization
return rewriter.notifyMatchFailure(transposeOp,
"input must be transpose operation");
- SmallVector<int64_t> transposePerms, innerTransposePerms;
+ SmallVector<int32_t> transposePerms, innerTransposePerms;
if (transposeOp.getConstantPerms(transposePerms).failed())
return rewriter.notifyMatchFailure(transposeOp,
"transpose perms must be constant");
@@ -497,8 +497,10 @@ OpFoldResult AddOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr))
return getInput1();
@@ -536,8 +538,10 @@ OpFoldResult IntDivOp::fold(FoldAdaptor adaptor) {
// IntDivOp inputs must be integer type, no need to check for quantized type
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsAttr && lhsAttr.isSplat()) {
if (llvm::isa<IntegerType>(resultETy) &&
lhsAttr.getSplatValue<APInt>().isZero())
@@ -605,10 +609,13 @@ OpFoldResult MulOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
const int64_t shift = llvm::isa<IntegerType>(resultETy) ? getShift() : 0;
+
if (rhsTy == resultTy) {
if (isSplatZero(resultETy, lhsAttr))
return lhsAttr.resizeSplat(resultTy);
@@ -638,8 +645,10 @@ OpFoldResult SubOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr))
return getInput1();
@@ -681,8 +690,10 @@ struct APIntFoldGreaterEqual {
OpFoldResult GreaterOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (!lhsAttr || !rhsAttr)
return {};
@@ -693,8 +704,10 @@ OpFoldResult GreaterOp::fold(FoldAdaptor adaptor) {
OpFoldResult GreaterEqualOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (!lhsAttr || !rhsAttr)
return {};
@@ -706,8 +719,10 @@ OpFoldResult GreaterEqualOp::fold(FoldAdaptor adaptor) {
OpFoldResult EqualOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
Value lhs = getInput1();
Value rhs = getInput2();
auto lhsTy = llvm::cast<ShapedType>(lhs.getType());
@@ -838,14 +853,16 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
return {};
// reshape(const(x)) -> const(reshape-attr(x))
- if (auto operand = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
+ if (auto operand =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
// Constants must have static shape.
if (!outputTy.hasStaticShape())
return {};
// Okay to duplicate splat constants.
if (operand.isSplat())
- return SplatElementsAttr::get(outputTy, operand.getSplatValue<Attribute>());
+ return SplatElementsAttr::get(outputTy,
+ operand.getSplatValue<Attribute>());
// Don't duplicate other constants.
if (!getInput1().hasOneUse())
@@ -905,7 +922,8 @@ OpFoldResult ReverseOp::fold(FoldAdaptor adaptor) {
auto operand = getInput();
auto operandTy = llvm::cast<ShapedType>(operand.getType());
auto axis = getAxis();
- auto operandAttr = llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput());
+ auto operandAttr =
+ llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput());
if (operandAttr)
return operandAttr;
@@ -954,7 +972,8 @@ OpFoldResult tosa::SelectOp::fold(FoldAdaptor adaptor) {
if (getOnTrue() == getOnFalse())
return getOnTrue();
- auto predicate = llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred());
+ auto predicate =
+ llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred());
if (!predicate)
return {};
@@ -975,7 +994,8 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::cast<ShapedType>(getType());
// Transposing splat values just means reshaping.
- if (auto input = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
+ if (auto input =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
if (input.isSplat() && resultTy.hasStaticShape() &&
input.getType().getElementType() == resultTy.getElementType())
return input.reshape(resultTy);
@@ -986,11 +1006,11 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
return {};
// Transpose is not the identity transpose.
- SmallVector<int64_t> perms;
+ SmallVector<int32_t> perms;
if (getConstantPerms(perms).failed())
return {};
- if (!llvm::equal(llvm::seq<int64_t>(0, perms.size()), perms))
+ if (!llvm::equal(llvm::seq<int32_t>(0, perms.size()), perms))
return {};
return getInput1();
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index d93db1b..0d0241f 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -204,22 +204,6 @@ void mlir::tosa::printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type,
// TOSA Operator Verifiers.
//===----------------------------------------------------------------------===//
-static bool hasZeroDimension(ShapedType shapedType) {
- if (!shapedType.hasRank())
- return false;
-
- auto rank = shapedType.getRank();
-
- for (int i = 0; i < rank; i++) {
- if (shapedType.isDynamicDim(i))
- continue;
- if (shapedType.getDimSize(i) == 0)
- return true;
- }
-
- return false;
-}
-
template <typename T>
static LogicalResult verifyConvOp(T op) {
// All TOSA conv ops have an input() and weight().
@@ -236,10 +220,6 @@ static LogicalResult verifyConvOp(T op) {
return failure();
}
- if (hasZeroDimension(inputType))
- return op.emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
-
auto inputEType = inputType.getElementType();
auto weightEType = weightType.getElementType();
@@ -262,6 +242,29 @@ static LogicalResult verifyConvOp(T op) {
"allowed for float type");
return failure();
}
+ return success();
+}
+
+LogicalResult tosa::ConstOp::verify() {
+ auto attrType = llvm::dyn_cast<TensorType>(getValueAttr().getType());
+ auto outputType = llvm::dyn_cast<TensorType>(getOutput().getType());
+
+ if (!attrType || !outputType) {
+ emitOpError("expected tensors for attr/result type");
+ return failure();
+ }
+
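+  // A quantized output element type is accepted as long as the attribute
+  // holds the quantized type's storage type (e.g. i8 values backing a
+  // quantized result element type).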
+ if (auto result = llvm::dyn_cast<mlir::quant::QuantizedType>(
+ outputType.getElementType())) {
+ if (result.getStorageType() == attrType.getElementType())
+ return success();
+ }
+
+ if (attrType.getElementType() != outputType.getElementType()) {
+ emitOpError("expected same attr/result element types");
+ return failure();
+ }
return success();
}
@@ -283,9 +286,6 @@ LogicalResult tosa::ArgMaxOp::verify() {
LogicalResult tosa::AvgPool2dOp::verify() {
auto inputType = llvm::cast<ShapedType>(getInput().getType());
- if (hasZeroDimension(inputType))
- return emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
auto inputETy = inputType.getElementType();
auto resultETy = llvm::cast<ShapedType>(getType()).getElementType();
@@ -341,9 +341,9 @@ LogicalResult tosa::ClampOp::verify() {
if (inputETy != outputETy)
return emitOpError("input/output element types are incompatible.");
- // if input datatype is float, check that the two min/max_fp attributes share
- // the same type and that their type is either the same of the input's
- // datatype, or a float type whose bitwidth > input datatype bitwidth
+  // If the input datatype is float, check that the two min/max_fp attributes
+  // share the same type and that their type is either the same as the input's
+  // datatype, or a float type whose bitwidth > input datatype bitwidth.
if (!inputETy.isInteger(dataTypeBitWidth)) {
if (((maxFpType != minFpType) ||
(maxFpType != inputETy && maxFpType.getIntOrFloatBitWidth() <=
@@ -383,7 +383,8 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
}
}
-/// Handles tosa.transpose_conv2d which has outpad and output shape attributes.
+/// Handles tosa.transpose_conv2d which has outpad and output shape
+/// attributes.
static void buildTransConvOpWithQuantInfo(
OpBuilder &builder, OperationState &result, Type outputType, Value input,
Value weight, Value bias, DenseI64ArrayAttr outpad,
@@ -420,9 +421,9 @@ static void buildFCOpWithQuantInfo(OpBuilder &builder, OperationState &result,
}
}
-/// The tosa.matmul op is also intended to be generated where a fully_connected
-/// op must be constructed where the weight is not a constant. In this case,
-/// the fully_connected op must be expressed using matmul.
+/// The tosa.matmul op is also intended to be generated when a
+/// fully_connected op must be constructed but the weight is not a constant.
+/// In this case, the fully_connected op must be expressed using matmul.
/// TODO: Add link to the legalization document explaining this.
static void buildMatMulOpWithQuantInfo(OpBuilder &builder,
OperationState &result, Type outputType,
@@ -457,9 +458,9 @@ static void buildMatMulOpWithQuantInfo(OpBuilder &builder,
}
}
-/// Both the tosa.avg_pool2d and unary ops use the same UnaruOpQuantizationAttr
-/// but avg_pool operator has its own builder as it has additional parameters
-/// not part of the unary ops.
+/// Both the tosa.avg_pool2d and unary ops use the same
+/// UnaryOpQuantizationAttr, but the avg_pool operator has its own builder as
+/// it has additional parameters not part of the unary ops.
static void
buildAvgPool2dOpWithQuantInfo(OpBuilder &builder, OperationState &result,
Type outputType, Value input,
@@ -526,8 +527,8 @@ static LogicalResult resolveBroadcastShape(const ValueShapeRange &operands,
for (int i = 0, e = operands.size(); i != e; ++i) {
auto shape = operands.getShape(i);
if (!shape.hasRank()) {
- // TODO(jennik): Update function to have better case handling for invalid
- // operands and for ranked tensors.
+ // TODO(jennik): Update function to have better case handling for
+ // invalid operands and for ranked tensors.
return failure();
}
outRank = std::max<int64_t>(outRank, shape.getRank());
@@ -776,8 +777,8 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
return success();
}
- // If the input rank is unknown we can info the output rank using the padding
- // shape's first dim.
+  // If the input rank is unknown we can infer the output rank using the
+  // padding shape's first dim.
if (!inputShape.hasRank()) {
if (paddingShape.isDynamicDim(0)) {
inferredReturnShapes.push_back(ShapedTypeComponents());
@@ -1000,10 +1001,6 @@ llvm::LogicalResult tosa::ReshapeOp::verify() {
TensorType inputType = getInput1().getType();
RankedTensorType outputType = getType();
- if (hasZeroDimension(inputType) || hasZeroDimension(outputType))
- return emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
-
if ((int64_t)getNewShape().size() != outputType.getRank())
return emitOpError() << "new shape does not match result rank";
@@ -1034,16 +1031,15 @@ llvm::LogicalResult tosa::ReshapeOp::verify() {
return mlir::success();
}
-LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int64_t> &perms) {
+LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int32_t> &perms) {
// Perms must be constants.
DenseIntElementsAttr permsAttr;
if (!matchPattern(getPerms(), m_Constant(&permsAttr)))
return failure();
- // Transpose is not the identity transpose.
- perms = llvm::to_vector(
- llvm::map_range(permsAttr.getValues<APInt>(),
- [](const APInt &val) { return val.getSExtValue(); }));
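+  // Extract the permutation elements as int32_t; TOSA perms are i32 tensors.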
+ perms.clear();
+ for (auto v : permsAttr.getValues<APInt>())
+ perms.push_back(v.getSExtValue());
return success();
}
@@ -1067,8 +1063,8 @@ LogicalResult tosa::TransposeOp::inferReturnTypeComponents(
return success();
}
- // This would imply the number of permutations does not match the rank of the
- // input which is illegal.
+  // This would imply the number of permutations does not match the rank of
+  // the input, which is illegal.
if (permsShape.getDimSize(0) != inputShape.getRank()) {
return failure();
}
@@ -1154,19 +1150,38 @@ LogicalResult tosa::TransposeOp::verify() {
<< " (output rank) but got size "
<< permType.getDimSize(0);
- SmallVector<int64_t> constantPerms;
+ SmallVector<int32_t> constantPerms;
if (succeeded(getConstantPerms(constantPerms))) {
- // Assert that the permutation tensor has a rank, which means that the rank
- // has been verified above.
+ // Assert that the permutation tensor has a rank, which means that the
+ // rank has been verified above.
assert(permType.hasRank() &&
"Unexpectedly found permutation tensor without rank");
- if (!isPermutationVector(constantPerms))
+ if (!llvm::all_of(constantPerms,
+ [&constantPerms](int32_t s) {
+ return s >= 0 &&
+ static_cast<size_t>(s) < constantPerms.size();
+ }) ||
+ !isPermutationVector(llvm::to_vector(llvm::map_range(
+ constantPerms, [](int32_t v) -> int64_t { return v; }))))
return emitOpError() << "expected valid permutation tensor";
- if (inputType.hasRank() && !llvm::all_of(constantPerms, [&](int64_t s) {
- return s < inputType.getRank();
- })) {
- return emitOpError() << "permutation must be within input bounds";
+    // Verify that the output shape is the input shape permuted by the
+    // constant perms.
+ if (inputType.hasRank() && outputType.hasRank()) {
+ assert(constantPerms.size() == static_cast<size_t>(inputType.getRank()) &&
+ inputType.getRank() == outputType.getRank());
+
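+      // Dynamic dimensions cannot be checked statically, so skip them.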
+ for (auto i = 0; i < outputType.getRank(); i++) {
+ if (inputType.isDynamicDim(constantPerms[i]) ||
+ outputType.isDynamicDim(i))
+ continue;
+
+ if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i))
+ return emitOpError()
+ << "expected output tensor dim " << i << " to match "
+ << "input dim " << constantPerms[i] << " with value of "
+ << inputType.getDimSize(constantPerms[i]);
+ }
}
}
return success();
@@ -1175,7 +1190,7 @@ LogicalResult tosa::TransposeOp::verify() {
LogicalResult TransposeOp::reifyResultShapes(
OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedReturnShapes) {
- SmallVector<int64_t> transposePerms;
+ SmallVector<int32_t> transposePerms;
if (getConstantPerms(transposePerms).failed())
return failure();
@@ -1184,7 +1199,7 @@ LogicalResult TransposeOp::reifyResultShapes(
SmallVector<OpFoldResult> returnedDims(inputType.getRank());
for (auto dim : transposePerms) {
- int64_t dimInInput = transposePerms[dim];
+ int32_t dimInInput = transposePerms[dim];
if (inputType.isDynamicDim(dimInInput))
returnedDims[dim] =
builder.create<tensor::DimOp>(getLoc(), input, dimInInput)
@@ -1378,8 +1393,8 @@ static LogicalResult verifyReduceOp(T op) {
<< ")";
return failure();
}
- // We can only verify the reduced dimension size to be 1 if this is not the
- // special case of output rank == 0.
+ // We can only verify the reduced dimension size to be 1 if this is not
+ // the special case of output rank == 0.
if (outputRank != 0) {
auto outputShape = outputType.getShape();
if (!outputType.isDynamicDim(reduceAxis) &&
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index 39699ee..0d55d1899 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s
// CHECK-LABEL: @matmul
func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) {
@@ -521,7 +521,7 @@ func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tens
// CHECK-LABEL: @conv2d_i8
func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
+ // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) {
@@ -542,7 +542,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
// CHECK-LABEL: @conv2d_f32
func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
+ // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
index c2bbfd5..73da281 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
@@ -24,7 +24,7 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> {
// check that the tosa verifier kicks in
func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 8e19f87..2902c4a 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -80,14 +80,14 @@ func.func @transpose_fold_4d_int() -> tensor<3x1x4x2xi32> {
[[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]],
[[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
]]> : tensor<1x2x3x4xi32>} : () -> tensor<1x2x3x4xi32>
- %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64>
+ %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<[
// CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]],
// CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]],
// CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]]
// CHECK-SAME{LITERAL}: ]>
- %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi64>) -> tensor<3x1x4x2xi32>
+ %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<3x1x4x2xi32>
// CHECK: return %[[CST]]
return %1 : tensor<3x1x4x2xi32>
}
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 418f768..414bcfe 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -1,6 +1,22 @@
// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate=strict-op-spec-alignment
+func.func @test_const() -> tensor<1xf32> {
+ // expected-error@+1{{'tosa.const' op expected same attr/result element types}}
+ %0 = "tosa.const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xf32>
+ return %0 : tensor<1xf32>
+}
+
+// -----
+
+func.func @test_const_non_tensor_attr() {
+  // expected-error@+1{{'tosa.const' op expected tensors for attr/result type}}
+ %0 = "tosa.const"() {value = dense<1.0> : vector<f32>} : () -> tensor<f32>
+ return
+}
+
+// -----
+
func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
// expected-error@+1 {{expect both input and weight to be float or not together, got 'f32' and 'i8'}}
%0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
@@ -148,6 +164,42 @@ func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>)
// -----
+func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
+ %perms = "tosa.const"() {value = dense<[-1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ return %1 : tensor<*xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
+ %perms = "tosa.const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ return %1 : tensor<*xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> {
+ %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ return %1 : tensor<3x4xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> {
+ %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<2x?xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ return %1 : tensor<3x4xi32>
+}
+
+// -----
+
func.func @test_fully_connected_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<273x2xf32> {
%0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
%1 = tosa.reshape %arg0 {new_shape = array<i64: 273, 3>} : (tensor<13x21x3xf32>) -> tensor<273x3xf32>
@@ -269,7 +321,7 @@ func.func @test_reshape_type_mismatch(%arg0 : tensor<13x21x3xf32>) -> () {
// -----
func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> () {
- // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<13x0x3xf32>'}}
%0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<13x0x3xf32>) -> tensor<13x0x3xf32>
return
}
@@ -277,7 +329,7 @@ func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> ()
// -----
func.func @test_reshape_zero_dim_input(%arg0 : tensor<?x0x3xf32>) -> () {
- // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<?x0x3xf32>'}}
%0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<?x0x3xf32>) -> tensor<13x0x3xf32>
return
}
@@ -341,7 +393,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> {
// -----
func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
- // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x29x0x4xf32>'}}
%0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
: (tensor<1x29x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
return %0 : tensor<1x27x27x16xf32>
@@ -350,8 +402,8 @@ func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1:
// -----
func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
- // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
- %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+ // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x?x0x4xf32>'}}
+ %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
: (tensor<1x?x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
return %0 : tensor<1x27x27x16xf32>
}
@@ -360,7 +412,7 @@ func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<
// -----
func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x7x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
@@ -369,7 +421,7 @@ func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) ->
// -----
func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
@@ -469,7 +521,7 @@ func.func @test_tile_io_rank_mismatch() {
// CHECK-LABEL: @test_invalid_constant_permutation
func.func @test_invalid_constant_permutation() {
- // expected-error@+3 {{permutation must be within input bounds}}
+ // expected-error@+3 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = tensor.empty() : tensor<3x4x5xi32>
%1 = arith.constant dense<[3, 0, 1]> : tensor<3xi32>
%2 = tosa.transpose %0, %1 : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<3x4x5xi32>
@@ -480,7 +532,7 @@ func.func @test_invalid_constant_permutation() {
// CHECK-LABEL: test_rank_size_constant_permutation
func.func @test_rank_size_constant_permutation() {
- // expected-error@+4 {{permutation must be within input bounds}}
+ // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
%1 = arith.constant dense<[0, 2]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -492,7 +544,7 @@ func.func @test_rank_size_constant_permutation() {
// CHECK-LABEL: test_large_constant_permutation
func.func @test_large_constant_permutation() {
- // expected-error@+4 {{permutation must be within input bounds}}
+ // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
%1 = arith.constant dense<[1185677355, 332462212]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -504,7 +556,7 @@ func.func @test_large_constant_permutation() {
// CHECK-LABEL: test_table_rank0_table
func.func @test_table_rank0_table(%arg0: tensor<64xi16>, %arg1: tensor<i16>) {
- // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tensor, but got 'tensor<i16>'}}
+ // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tosa-conformant tensor, but got 'tensor<i16>'}}
%0 = tosa.table %arg0, %arg1 : (tensor<64xi16>, tensor<i16>) -> tensor<64xi16>
return
}
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 525ee91..a1600fd 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -574,6 +574,22 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
}
// -----
+// CHECK-LABEL: transpose_dynamic_dim
+func.func @test_transpose_dynamic_dim(%arg0: tensor<13x?x3xf32>) -> tensor<3x13x?xf32> {
+ %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+ %1 = tosa.transpose %arg0, %0 : (tensor<13x?x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ return %1 : tensor<3x13x?xf32>
+}
+
+// -----
+// CHECK-LABEL: transpose_half_dynamic_dim
+func.func @test_transpose_half_dynamic_dim(%arg0: tensor<13x3x3xf32>) -> tensor<3x13x?xf32> {
+ %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+ %1 = tosa.transpose %arg0, %0 : (tensor<13x3x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ return %1 : tensor<3x13x?xf32>
+}
+
+// -----
// CHECK-LABEL: gather
func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xf32> {
%0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x26xi32>) -> tensor<13x26x3xf32>
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f3d3c74..b86fcac 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -273,6 +273,16 @@ libc_support_library(
)
libc_support_library(
+ name = "__support_macros_null_check",
+ hdrs = ["src/__support/macros/null_check.h"],
+ deps = [
+ ":__support_macros_config",
+ ":__support_macros_optimization",
+ ":__support_macros_sanitizer",
+ ],
+)
+
+libc_support_library(
name = "__support_common",
hdrs = [
"src/__support/common.h",
@@ -665,6 +675,7 @@ libc_support_library(
":__support_ctype_utils",
":__support_fputil_fp_bits",
":__support_fputil_rounding_mode",
+ ":__support_macros_null_check",
":__support_str_to_integer",
":__support_str_to_num_result",
":__support_uint128",